From 2182a4f9e9f179d94905c06ec948983bb6f2f1b9 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Mon, 14 Nov 2022 09:53:23 +0100 Subject: [PATCH 001/210] - Modified mem_desc() to return reference to Tensor::memory::desc to (#47844) avoid copying --- paddle/phi/core/dense_tensor.inl | 2 +- paddle/phi/core/dense_tensor_impl.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index c876ba679ba5d..d631032cae083 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -116,7 +116,7 @@ following codes there. #ifdef PADDLE_WITH_MKLDNN public: - dnnl::memory::desc mem_desc() const; + const dnnl::memory::desc& mem_desc() const; inline void set_mem_desc(const dnnl::memory::desc& mem_desc) { mem_desc_ = mem_desc; diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index f16b6625011a1..c8998f65efb6a 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -344,7 +344,7 @@ std::vector DenseTensor::Chunk(int64_t chunks, } #ifdef PADDLE_WITH_MKLDNN -dnnl::memory::desc DenseTensor::mem_desc() const { return mem_desc_; } +const dnnl::memory::desc& DenseTensor::mem_desc() const { return mem_desc_; } #endif // NOTE: For historical reasons, this interface has a special behavior, From 42c8d51a94c4149e72e162b8749637cc5218d270 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 14 Nov 2022 17:22:50 +0800 Subject: [PATCH 002/210] clean fluid elementwise_max (#47866) --- .../dygraph_optimizer/hybrid_parallel_optimizer.py | 2 +- .../fleet/meta_optimizers/localsgd_optimizer.py | 2 +- .../fleet/meta_parallel/sharding/group_sharded_utils.py | 2 +- .../fleet/meta_parallel/sharding/sharding_utils.py | 2 +- python/paddle/fluid/clip.py | 8 +++----- python/paddle/fluid/dygraph/learning_rate_scheduler.py | 5 ++--- .../paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py | 2 +- .../unittests/dygraph_to_static/simnet_dygraph_model.py | 4 +++- python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps11.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps12.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps13.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps2.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps3.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps4.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps5.py | 2 +- .../paddle/fluid/tests/unittests/test_dist_fleet_ps6.py | 2 +- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++---- .../paddle/incubate/distributed/models/moe/grad_clip.py | 4 +++- 19 files changed, 27 insertions(+), 28 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 33922b7f35d9c..bd05cbe879718 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -160,7 +160,7 @@ def _dygraph_clip(self, params_grads): ) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var_fp32, y=max_global_norm), + y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm), ) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) for p, g in 
params_grads: diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index b3c4231b36f3e..1cd0b23488ed7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -468,7 +468,7 @@ def communicate_avg_loss(): next_local_steps = layers.elementwise_min( next_local_steps, max_local_steps ) - next_local_steps = layers.elementwise_max( + next_local_steps = paddle.maximum( next_local_steps, min_local_steps ) layers.assign(next_local_steps, k_steps) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 6832e9a7caa21..2976ef88e5983 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -141,7 +141,7 @@ def _dygraph_clip(self, params_grads): clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), + y=paddle.maximum(x=global_norm_var, y=max_global_norm), ) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 07cf159c3e66f..4cee382339538 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -138,7 +138,7 @@ def _dygraph_clip(self, params_grads): clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), + y=paddle.maximum(x=global_norm_var, y=max_global_norm), ) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 68a2f8a0deea3..19c8629fa9ad8 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -550,7 +550,7 @@ def _dygraph_clip(self, params_grads): need_clip = True clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), + y=paddle.maximum(x=global_norm_var, y=max_global_norm), ) elif global_norm_var > max_global_norm: # only when global_norm_var > max_global_norm, grad need clip @@ -654,9 +654,7 @@ def _static_clip(self, params_grads): ) scale_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max( - x=max_global_norm, y=global_norm_var - ), + y=paddle.maximum(x=max_global_norm, y=global_norm_var), ) param_new_grad_name_dict = dict() for p, g in params_grads: @@ -733,7 +731,7 @@ def _create_operators(self, param, grad): clip_var = self.context[self.group_name + "_clip"] group_scale_var = layers.elementwise_div( x=clip_var, - y=layers.elementwise_max(x=clip_var, y=group_norm_var), + y=paddle.maximum(x=clip_var, y=group_norm_var), ) assert group_scale_var.shape == (1,) self.context[group_scale_name] = group_scale_var diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 3afe92cbc6234..0204542d6ec2b 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -15,6 +15,7 @@ import math import warnings +import paddle from .. 
import unique_name from ..framework import Variable from ..data_feeder import check_type @@ -977,11 +978,9 @@ def step(self, loss): self.num_bad_epochs += 1 if self.num_bad_epochs > self.patience: - from .. import layers - self.cooldown_counter = self.cooldown self.num_bad_epochs = 0 - new_lr = layers.elementwise_max( + new_lr = paddle.maximum( self.learning_rate * self.decay_rate, self.min_lr ) if self.learning_rate - new_lr > self.eps: diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index f9d926ad1c733..cc6371cc7cc67 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -74,7 +74,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index 0bb08405141c3..facc72faf8736 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle + import paddle.fluid as fluid import paddle.fluid.param_attr as attr @@ -151,7 +153,7 @@ def ops(self, x, y): """ operation """ - max = fluid.layers.elementwise_max(x, y) + max = paddle.maximum(x, y) return max diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index 8d5ac58d62aaa..42a96cc66f41d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py index b0d8df316a8ab..4fc0e2eb5a0c3 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index b4c10116a55c3..f38dbf9e56334 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -59,7 +59,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( 
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py index 47cbaefd68d7b..1c8c3b4f879e0 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -60,7 +60,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index a4cdcb32bd412..335577818259c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -59,7 +59,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index 2eb6277018441..abd0ff1c858c1 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 0cb4ee6e3af2f..cfc806d372b01 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index fbb640fc8bbc8..f88ca8fcb1de3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index e0b73b4344c68..6cfae26323e5c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -56,7 +56,7 @@ def get_loss(cos_q_pt, cos_q_nt): cos_q_pt, ) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) - loss_op3 = 
fluid.layers.elementwise_max( + loss_op3 = paddle.maximum( fluid.layers.fill_constant_batch_size_like( input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' ), diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ba29e7430a942..9f9e98bfca1c7 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -652,14 +652,12 @@ def test_elementwise_minmax(self): min_eager_ret = layers.elementwise_min( to_variable(n), to_variable(n2) ) - max_eager_ret = layers.elementwise_max( - to_variable(n), to_variable(n2) - ) + max_eager_ret = paddle.maximum(to_variable(n), to_variable(n2)) min_eager_ret_value = min_eager_ret.numpy() max_eager_ret_value = max_eager_ret.numpy() min_ret = layers.elementwise_min(to_variable(n), to_variable(n2)) - max_ret = layers.elementwise_max(to_variable(n), to_variable(n2)) + max_ret = paddle.maximum(to_variable(n), to_variable(n2)) min_ret_value = min_ret.numpy() max_ret_value = max_ret.numpy() diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index 9a3a0dfc0f707..17372ea4f175d 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle + import paddle.distributed as dist from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.dygraph import base as imperative_base @@ -213,7 +215,7 @@ def _dygraph_clip(self, params_grads): ) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), + y=paddle.maximum(x=global_norm_var, y=max_global_norm), ) for p, g in params_grads: if g is None: From 1a145aab48764a058fc3cc677a8accd2233800db Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Mon, 14 Nov 2022 18:11:27 +0800 Subject: [PATCH 003/210] add cos double and triple grad operator (#47796) --- .../generator/codegen_utils.py | 2 + paddle/phi/api/yaml/backward.yaml | 24 ++++ paddle/phi/api/yaml/op_compat.yaml | 2 +- paddle/phi/kernels/activation_grad_kernel.h | 19 +++ .../phi/kernels/cpu/activation_grad_kernel.cc | 23 ++++ paddle/phi/kernels/funcs/activation_functor.h | 117 +++++++++++++++--- .../phi/kernels/gpu/activation_grad_kernel.cu | 20 +++ .../phi/kernels/impl/activation_grad_impl.h | 52 ++++++++ .../unittests/test_activation_nn_grad.py | 64 ++++++++++ 9 files changed, 308 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py index 796e0089110da..ac0b01dd4de98 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py @@ -38,6 +38,8 @@ "tanh_triple_grad", "sin_double_grad", "sin_triple_grad", + "cos_double_grad", + "cos_triple_grad", "subtract_double_grad", "divide_double_grad", "log_double_grad", diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 82c2b12b1762d..919f69525bbc2 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -172,6 +172,18 @@ kernel : func : cholesky_solve_grad +- backward_op : cos_double_grad + forward : cos_grad 
(Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : cos_double_grad + backward : cos_triple_grad + inplace : (grad_x_grad -> grad_out_grad) + - backward_op : cos_grad forward : cos (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -181,8 +193,20 @@ param : [x] kernel : func : cos_grad + backward : cos_double_grad inplace : (out_grad -> x_grad) +- backward_op : cos_triple_grad + forward : cos_double_grad (Tensor x, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_x), Tensor(grad_out_grad) + args : (Tensor x, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_x_grad, Tensor grad_out_grad_grad) + output : Tensor(x_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, x, grad_x_grad_forward] + kernel : + func : cos_triple_grad + inplace : (grad_x_grad_forward -> grad_out_forward_grad) + - backward_op : cosh_grad forward : cosh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 0af8731d5aa22..a0894a9aca8f6 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -229,7 +229,7 @@ attrs : [bool use_cudnn = true, bool use_mkldnn = false, int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] - op : cos - backward : cos_grad + backward : cos_grad, cos_double_grad, cos_triple_grad inputs : x : X outputs : diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 8004f4ad80b9c..847383fc38e94 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -88,6 +88,14 @@ void SinDoubleGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* ddout); +template +void CosDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + DenseTensor* dx, + DenseTensor* ddout); + template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, @@ -118,6 +126,17 @@ void SinTripleGradKernel(const Context& dev_ctx, DenseTensor* d_dout, DenseTensor* d_ddx); +template +void CosTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& d_dx_new, + const DenseTensor& d_ddout, + DenseTensor* d_x_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + template void LeakyReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index edbe4083fb02f..06485e847d6ad 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -336,6 +336,7 @@ PD_REGISTER_KERNEL(square_double_grad, phi::dtype::float16, int, int64_t) {} + PD_REGISTER_KERNEL(sin_double_grad, CPU, ALL_LAYOUT, @@ -345,6 +346,7 @@ PD_REGISTER_KERNEL(sin_double_grad, phi::dtype::float16, int, int64_t) {} + PD_REGISTER_KERNEL(sin_triple_grad, CPU, ALL_LAYOUT, @@ -354,6 +356,27 @@ PD_REGISTER_KERNEL(sin_triple_grad, phi::dtype::float16, int, int64_t) {} + +PD_REGISTER_KERNEL(cos_double_grad, + CPU, + ALL_LAYOUT, + phi::CosDoubleGradKernel, + float, + double, + 
phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(cos_triple_grad, + CPU, + ALL_LAYOUT, + phi::CosTripleGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 460e6300c4087..ccdff93d5b23c 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -117,23 +117,22 @@ struct SinDoubleGradFunctor : public BaseActivationFunctor { DenseTensor* dX, DenseTensor* ddOut) const { auto* d = dev.eigen_device(); - auto ddx = EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SinDoubleGrad")); + auto d2d1x = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "d2d1x", "SinDoubleGrad")); auto x = EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "SinDoubleGrad")); - // sin DoubleGrad: ddy=cos(x)*ddx, dx=-sin(x)*dy*ddx + GET_DATA_SAFELY(X, "Input", "x", "SinDoubleGrad")); - // calculate dx first, so ddy can inplace ddx - auto dx = EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "SinDoubleGrad")); - auto dout = EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "SinDoubleGrad")); - dx.device(*d) = -ddx * x.unaryExpr(Sine()) * dout; + // calculate d2x first, so d2d1y can inplace d2d1x + auto d2x = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "d2x", "SinDoubleGrad")); + auto d1y = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "d1y", "SinDoubleGrad")); + d2x.device(*d) = -d2d1x * x.unaryExpr(Sine()) * d1y; - // calculate ddout - auto ddout = EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SinDoubleGrad")); - ddout.device(*d) = ddx * x.unaryExpr(Cosine()); + // calculate d2d1y + auto d2d1y = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "d2d1y", "SinDoubleGrad")); + d2d1y.device(*d) = d2d1x * x.unaryExpr(Cosine()); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; @@ -221,6 +220,22 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { } }; +// 1st reverse grad +// y = cos(x) +// x --> y +// d1x = d1y * -sin(x) +// +// 2nd reverse grad +// x, d1y --> d1x +// d2x = -cos(x) * d1y * d2d1x +// d2d1y = -sin(x) * d2d1x +// +// 3rd reverse grad +// x, d1y, d2d1x --> d2x, d2d1y +// d3x = sin(x) * d1y * d2d1x * d3d2x - cos(x) * d2d1x * d3d2d1y +// d3d1y = -cos(x) * d2d1x * d3d2x +// d3d2d1x = -cos(x) * d1y * d3d2x - sin(x) * d3d2d1y + // cosine'(x) = -sin(x) template struct CosGradFunctor : public BaseActivationFunctor { @@ -236,6 +251,80 @@ struct CosGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// cos''(x) = -cos(x) +template +struct CosDoubleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* dOut, + const DenseTensor* ddX, + DenseTensor* dX, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto d2d1x = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosDoubleGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "x", "CosDoubleGrad")); + + // calculate d2x first, so d2d1y can inplace d2d1x + auto d2x = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "d2x", "CosDoubleGrad")); + auto 
d1y = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "d1y", "CosDoubleGrad")); + d2x.device(*d) = -d2d1x * x.unaryExpr(Cosine()) * d1y; + + // calculate d2d1y + auto d2d1y = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "d2d1y", "CosDoubleGrad")); + d2d1y.device(*d) = -d2d1x * x.unaryExpr(Sine()); + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CosTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dx_New, + DenseTensor* d_d_Out, + DenseTensor* d_x_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "x", "CosTripleGrad")); + auto d2d1x = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "d2d1x", "CosTripleGrad")); + auto d1y = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "d1y", "CosTripleGrad")); + auto d3d2d1y = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "d3d2d1y", "CosTripleGrad")); + auto d3d2x = EigenVector::Flatten( + GET_DATA_SAFELY(d_dx_New, "Input", "d3d2x", "CosTripleGrad")); + + auto d3x = EigenVector::Flatten( + GET_DATA_SAFELY(d_x_New, "Output", "d3x", "CosTripleGrad")); + d3x.device(*d) = x.unaryExpr(Sine()) * d1y * d2d1x * d3d2x - + x.unaryExpr(Cosine()) * d2d1x * d3d2d1y; + + auto d3d1y = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "d3d1y", "CosTripleGrad")); + d3d1y.device(*d) = -x.unaryExpr(Cosine()) * d2d1x * d3d2x; + + auto d3d2d1x = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "d3d2d1x", "CosTripleGrad")); + d3d2d1x.device(*d) = -x.unaryExpr(Cosine()) * d1y * d3d2x - + x.unaryExpr(Sine()) * d3d2d1y; + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + // cosine(x) = cos(x) template struct CosFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 763af5652ee5b..5e75909649a65 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -437,6 +437,26 @@ PD_REGISTER_KERNEL(sin_triple_grad, int64_t, phi::dtype::float16) {} +PD_REGISTER_KERNEL(cos_double_grad, + GPU, + ALL_LAYOUT, + phi::CosDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(cos_triple_grad, + GPU, + ALL_LAYOUT, + phi::CosTripleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 76082608e8227..dd7dadc1e1cf9 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -646,4 +646,56 @@ void SinTripleGradKernel(const Context& dev_ctx, d_ddx); // output } +template +void CosDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + phi::funcs::CosDoubleGradFunctor 
functor; + functor(dev_ctx, &x, &dout, &ddx, dx, ddout); +} + +template +void CosTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& d_dx_new, + const DenseTensor& d_ddout, + DenseTensor* d_x_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(x.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_x_new) { + d_dout->Resize(x.dims()); + dev_ctx.template Alloc(d_x_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::CosTripleGradFunctor functor; + functor(dev_ctx, + &x, + &ddx, + &dout, + &d_ddout, + &d_dx_new, // input + d_dout, + d_x_new, + d_ddx); // output +} + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index f8ec154f92dc2..38a894755f464 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -503,6 +503,38 @@ def test_grad(self): self.func(p) +class TestCosDoubleGradCheck(unittest.TestCase): + def cos_wrapper(self, x): + return paddle.cos(x[0]) + + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = paddle.cos(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps + ) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph( + self.cos_wrapper, [x], y, x_init=x_arr, place=place + ) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestPowDoubleGradCheck1(unittest.TestCase): def pow_wrapper(self, x): return paddle.pow(x[0], 2) @@ -690,5 +722,37 @@ def test_grad(self): self.func(p) +class TestCosTripleGradCheck(unittest.TestCase): + def cos_wrapper(self, x): + return paddle.cos(x[0]) + + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = layers.cos(x) + x_arr = np.random.random(shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.triple_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps + ) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph( + self.cos_wrapper, [x], y, x_init=x_arr, place=place + ) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() From e0be4b9447298665c2446b7c78376c3b1edc3c24 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 14 Nov 2022 20:07:08 +0800 Subject: [PATCH 004/210] [Zero-Dim] support input 0D Tensor as scalar attribute for some api (#47689) * [Zero-Dim] support input 0D Tensor as scalar attribute for some api * fix doc --- paddle/phi/infermeta/ternary.cc | 84 ++--- python/paddle/fluid/layers/tensor.py | 12 +- 
.../tests/unittests/test_zero_dim_tensor.py | 299 ++++++++++++++++++ python/paddle/tensor/creation.py | 196 ++++++------ python/paddle/tensor/random.py | 86 +++-- 5 files changed, 471 insertions(+), 206 deletions(-) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index c8310707351a7..dc219deac0691 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -259,48 +259,24 @@ void ArangeInferMeta(const MetaTensor& start, const MetaTensor& end, const MetaTensor& step, MetaTensor* out) { - auto start_dims = start.dims(); - auto end_dims = end.dims(); - auto step_dims = step.dims(); - PADDLE_ENFORCE_EQ( - start_dims.size(), - 1, - phi::errors::InvalidArgument( - "The dim of the shape of Input(Start) should be 1, but got %d", - start_dims.size())); - - PADDLE_ENFORCE_EQ(start_dims[0], + PADDLE_ENFORCE_EQ(phi::product(start.dims()), 1, phi::errors::InvalidArgument( - "The first dim of the shape of Input(Start) should " - "be 1, but got %d", - start_dims[0])); - PADDLE_ENFORCE_EQ( - end_dims.size(), - 1, - phi::errors::InvalidArgument( - "The dim of the shape of Input(End) should be 1, but got %d", - end_dims.size())); + "The numel of Input(start) should be 1, but got %d", + phi::product(start.dims()))); - PADDLE_ENFORCE_EQ( - end_dims[0], - 1, - phi::errors::InvalidArgument("The first dim of the shape of " - "Input(End) should be 1, but got %d", - end_dims[0])); - PADDLE_ENFORCE_EQ( - step_dims.size(), - 1, - phi::errors::InvalidArgument( - "The dim of the shape of Input(Step) should be 1, but got %d", - step_dims.size())); + PADDLE_ENFORCE_EQ(phi::product(end.dims()), + 1, + phi::errors::InvalidArgument( + "The numel of Input(end) should be 1, but got %d", + phi::product(end.dims()))); - PADDLE_ENFORCE_EQ(step_dims[0], + PADDLE_ENFORCE_EQ(phi::product(step.dims()), 1, phi::errors::InvalidArgument( - "The first dim of the shape of Input(Step) should " - "be 1, but got %d", - step_dims[0])); + "The numel of Input(step) should be 1, but got %d", + phi::product(step.dims()))); + out->set_dims({-1}); out->set_dtype(start.dtype()); } @@ -635,27 +611,27 @@ void LinspaceRawInferMeta(const MetaTensor& start, const MetaTensor& stop, const MetaTensor& number, MetaTensor* out) { - auto s_dims = start.dims(); PADDLE_ENFORCE_EQ( - (s_dims.size() == 1) && (s_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = stop.dims(); + phi::product(start.dims()), + 1, + phi::errors::InvalidArgument("The size of Input(start) should be 1," + "but got %d.", + phi::product(start.dims()))); + PADDLE_ENFORCE_EQ( - (e_dims.size() == 1) && (e_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = number.dims(); + phi::product(stop.dims()), + 1, + phi::errors::InvalidArgument("The size of Input(stop) should be 1," + "but got %d.", + phi::product(stop.dims()))); + PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); + phi::product(number.dims()), + 1, + phi::errors::InvalidArgument("The size of Input(number) should be 1," + "but got %d.", + phi::product(number.dims()))); + out->set_dims(phi::make_ddim({-1})); out->set_dtype(start.dtype()); } diff --git a/python/paddle/fluid/layers/tensor.py 
b/python/paddle/fluid/layers/tensor.py index a282fae027f24..79766ba09f791 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -918,17 +918,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): if force_cpu: place = core.CPUPlace() if isinstance(shape, (list, tuple)): - for item in shape: - if not isinstance(item, Variable): - shape = list( - map( - lambda x: x.numpy().flat[0] - if isinstance(x, Variable) - else x, - shape, - ) - ) - break + shape = utils.convert_shape_to_list(shape) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 26f24f5f7e952..c85f5aec42e9f 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -498,5 +498,304 @@ def test_shape(self): np.testing.assert_array_equal(out.numpy(), np.array([])) +# Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. +class TestNoBackwardAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + + def test_slice(self): + starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')] + ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')] + x = paddle.rand([5, 3, 3]) + out = paddle.slice(x, [1, 2], starts, ends) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_strided_slice(self): + starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')] + ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')] + strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')] + x = paddle.rand([5, 5, 5]) + out = paddle.strided_slice(x, [1, 2], starts, ends, strides) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_linspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 5.0) + num = paddle.full([], 5, 'int32') + out = paddle.linspace(start, stop, num) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_arange(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 6.0) + step = paddle.full([], 1.0) + out = paddle.arange(start, stop, step) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_normal(self): + mean = paddle.full([], 0.0) + std = paddle.full([], 0.0) + out = paddle.normal(mean, std) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, []) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_rand(self): + out = paddle.rand([]) + self.assertEqual(out.shape, []) + + out = paddle.rand(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randn(self): + out = paddle.randn([]) + self.assertEqual(out.shape, []) + + out = paddle.randn(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randint_and_randint_like(self): + out = paddle.randint(-10, 10, []) + self.assertEqual(out.shape, []) + + out = paddle.randint_like(out, -10, 10) + self.assertEqual(out.shape, []) + + out = paddle.randint(-10, 10, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_standard_normal(self): + out = paddle.standard_normal([]) + self.assertEqual(out.shape, []) + + out = 
paddle.standard_normal(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_uniform(self): + out = paddle.uniform([]) + self.assertEqual(out.shape, []) + + out = paddle.uniform(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_empty_and_empty_like(self): + out = paddle.empty([]) + self.assertEqual(out.shape, []) + + out = paddle.empty_like(out) + self.assertEqual(out.shape, []) + + out = paddle.empty(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_full_and_full_like(self): + out = paddle.full([], 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full_like(out, 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full(self.shape, 0.5) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_ones_and_ones_like(self): + out = paddle.ones([]) + self.assertEqual(out.shape, []) + + out = paddle.ones_like(out) + self.assertEqual(out.shape, []) + + out = paddle.ones(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_zeros_and_zeros_like(self): + out = paddle.zeros([]) + self.assertEqual(out.shape, []) + + out = paddle.zeros_like(out) + self.assertEqual(out.shape, []) + + out = paddle.zeros(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + +class TestNoBackwardAPIStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.exe = paddle.static.Executor() + self.shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + + def test_slice(self): + starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')] + ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')] + x = paddle.rand([5, 3, 3]) + out = paddle.slice(x, [1, 2], starts, ends) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + self.assertEqual(res.shape, (5, 2, 2)) + + def test_strided_slice(self): + starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')] + ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')] + strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')] + x = paddle.rand([5, 5, 5]) + out = paddle.strided_slice(x, [1, 2], starts, ends, strides) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + self.assertEqual(res.shape, (5, 2, 2)) + + def test_linspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 5.0) + num = paddle.full([], 5, 'int32') + out = paddle.linspace(start, stop, num) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_arange(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 6.0) + step = paddle.full([], 1.0) + out = paddle.arange(start, stop, step) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_normal(self): + mean = paddle.full([], 0.0) + std = paddle.full([], 0.0) + out1 = paddle.normal(mean, std) + out2 = paddle.normal(0.0, 1.0, []) + out3 = paddle.normal(0.0, 1.0, self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_rand(self): + out1 = paddle.rand([]) + out2 = paddle.rand(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, 
()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_randn(self): + out1 = paddle.randn([]) + out2 = paddle.randn(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_randint_and_randint_like(self): + out1 = paddle.randint(-10, 10, []) + out2 = paddle.randint_like(out1, -10, 10) + out3 = paddle.randint(-10, 10, self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_standard_normal(self): + out1 = paddle.standard_normal([]) + out2 = paddle.standard_normal(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_uniform(self): + out1 = paddle.uniform([]) + out2 = paddle.uniform(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_empty_and_empty_like(self): + out1 = paddle.empty([]) + out2 = paddle.empty_like(out1) + out3 = paddle.empty(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_full_and_full_like(self): + out1 = paddle.full([], 0.5) + out2 = paddle.full_like(out1, 0.5) + out3 = paddle.full(self.shape, 0.5) + out4 = paddle.full(self.shape, paddle.full([], 0.5)) + + res = self.exe.run( + paddle.static.default_main_program(), + fetch_list=[out1, out2, out3, out4], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + self.assertEqual(res[3].shape, (2, 3, 4)) + + def test_ones_and_ones_like(self): + out1 = paddle.ones([]) + out2 = paddle.ones_like(out1) + out3 = paddle.ones(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_zeros_and_zeros_like(self): + out1 = paddle.zeros([]) + out2 = paddle.zeros_like(out1) + out3 = paddle.zeros(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index cf04a44cb02fb..774261237d228 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -65,12 +65,12 @@ def linspace(start, stop, num, dtype=None, name=None): Return fixed number of evenly spaced values within a given interval. Args: - start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \ - or a Tensor of shape [1] with input data type int32, int64, float32 or float64. - stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \ - or a Tensor of shape [1] with input data type int32, int64, float32 or float64. 
- num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \ - or a Tensor of shape [1] with data type int32. + start(int|float|Tensor): The input :attr:`start` is start of range. It is a int, float, \ + or a 0-D Tensor with data type int32, int64, float32 or float64. + stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a int, float, \ + or a 0-D Tensor with data type int32, int64, float32 or float64. + num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int, \ + or a 0-D Tensor with data type int32. dtype(np.dtype|str, optional): The data type of output tensor, it could be int32, int64, float32 and float64. Default: if None, the data type is float32. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. @@ -620,7 +620,9 @@ def ones(shape, dtype=None, name=None): Create a Tensor of specified :attr:`shape` and :attr:`dtype` and fill it with 1. Args: - shape (tuple|list|Tensor): Shape of the Tensor to be created, the data type of shape should be int32 or int64. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, the elements of it should be integers or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. @@ -633,21 +635,25 @@ def ones(shape, dtype=None, name=None): import paddle - # default dtype for ones OP + # shape is a list/tuple data1 = paddle.ones(shape=[3, 2]) # [[1. 1.] # [1. 1.] # [1. 1.]] - data2 = paddle.ones(shape=[2, 2], dtype='int32') - # [[1 1] - # [1 1]] - # shape is a Tensor - shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.ones(shape=shape, dtype='int32') - # [[1 1] - # [1 1]] + shape = paddle.to_tensor([3, 2]) + data2 = paddle.ones(shape=shape) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] + + # shape is a Tensor List + shape = [paddle.to_tensor(3), paddle.to_tensor(2)] + data3 = paddle.ones(shape=shape) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] """ if dtype is None: dtype = 'float32' @@ -690,7 +696,9 @@ def zeros(shape, dtype=None, name=None): Creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0. Args: - shape(tuple|list|Tensor): Shape of the Tensor to be created, the data type of ``shape`` is int32 or int64. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype(np.dtype|str, optional): Data type of output Tensor, it supports bool, float16, float32, float64, int32 and int64. Default: if None, the date type is float32. name(str, optional): The default value is None. Normally there is no need for user to set this @@ -702,21 +710,27 @@ def zeros(shape, dtype=None, name=None): Examples: .. code-block:: python - import paddle + import paddle - data = paddle.zeros(shape=[3, 2], dtype='float32') - # [[0. 0.] - # [0. 0.] - # [0. 0.]] - data = paddle.zeros(shape=[2, 2]) - # [[0. 0.] - # [0. 
0.]] - - # shape is a Tensor - shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.zeros(shape=shape, dtype='int32') - # [[0 0] - # [0 0]] + # shape is a list/tuple + data1 = paddle.zeros(shape=[3, 2]) + # [[0. 0.] + # [0. 0.] + # [0. 0.]] + + # shape is a Tensor + shape = paddle.to_tensor([3, 2]) + data2 = paddle.zeros(shape=shape) + # [[0. 0.] + # [0. 0.] + # [0. 0.]] + + # shape is a Tensor List + shape = [paddle.to_tensor(3), paddle.to_tensor(2)] + data3 = paddle.zeros(shape=shape) + # [[0. 0.] + # [0. 0.] + # [0. 0.]] """ if dtype is None: dtype = 'float32' @@ -844,12 +858,11 @@ def full(shape, fill_value, dtype=None, name=None): Return a Tensor with the ``fill_value`` which size is same as ``shape``. Args: - shape(list|tuple|Tensor): Shape of the Tensor to be created. - The data type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, - the elements of it should be integers or Tensors with shape [1]. - If ``shape`` is an Tensor, it should be an 1-D Tensor. - fill_value(bool|float|int|Tensor): The constant value - used to initialize the Tensor to be created. If ``fill_value`` is an Tensor, it must be an 1-D Tensor. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. + fill_value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. + If ``fill_value`` is an Tensor, it shoule be an 0-D Tensor which represents a scalar. dtype(np.dtype|str, optional): Data type of the output Tensor which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data type of created Tensor is `float32`. @@ -863,26 +876,32 @@ def full(shape, fill_value, dtype=None, name=None): import paddle - data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') - #[[0] - # [0]] - - # attr shape is a list which contains Tensor. - positive_2 = paddle.full([1], 2, "int32") - data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) - # [[1.5 1.5]] - - # attr shape is a Tensor. - shape = paddle.full([2], 2, "int32") - data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) - # [[True True] - # [True True]] - - # attr fill_value is a Tensor. - val = paddle.full([1], 2.0, "float32") - data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') - # [[2.0] - # [2.0]] + # shape is a list/tuple + data1 = paddle.full(shape=[3, 2], fill_value=1.) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] + + # shape is a Tensor + shape = paddle.to_tensor([3, 2]) + data2 = paddle.full(shape=shape, fill_value=2.) + # [[2. 2.] + # [2. 2.] + # [2. 2.]] + + # shape is a Tensor List + shape = [paddle.to_tensor(3), paddle.to_tensor(2)] + data3 = paddle.full(shape=shape, fill_value=3.) + # [[3. 3.] + # [3. 3.] + # [3. 3.]] + + # fill_value is a Tensor. + val = paddle.full([], 2.0, "float32") + data5 = paddle.full(shape=[3, 2], fill_value=val) + # [[2. 2.] + # [2. 2.] + # [2. 2.]] """ if dtype is None: @@ -904,16 +923,17 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): Parameters: start(float|int|Tensor): Start of interval. The interval includes this value. If ``end`` is None, the half-open interval is [0, ``start``). - If ``start`` is a Tensor, it is a 1-D Tensor with shape [1], with - data type int32, int64, float32, float64. Default is 0. 
+ If ``start`` is a Tensor, it is a 0-D Tensor which represents a scalar + and data type is int32, int64, float32, float64. Default is 0. end(float|int|Tensor, optional): End of interval. The interval does not - include this value. If ``end`` is a Tensor, it is a 1-D Tensor with - shape [1], with data type int32, int64, float32, float64. If ``end`` - is None, the half-open interval is [0, ``start``). Default is None. + include this value. If ``end`` is a Tensor, it is a 0-D Tensor which + represents a scalar and data type is int32, int64, float32, float64. + If ``end`` is None, the half-open interval is [0, ``start``). + Default is None. step(float|int|Tensor, optional): Spacing between values. For any out, it is the istance between two adjacent values, out[i+1] - out[i]. - If ``step`` is a Tensor, it is a 1-D Tensor with shape [1], with - data type int32, int64, float32, float64. Default is 1. + If ``step`` is a Tensor, it is a 0-D Tensor which represents a scalar + and data type is int32, int64, float32, float64. . Default is 1. dtype(str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64, float32, float64. If ``dytpe`` is None, the data type is float32. Default is None. @@ -939,7 +959,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out3 = paddle.arange(4.999, dtype='float32') # [0., 1., 2., 3., 4.] - start_var = paddle.to_tensor([3]) + start_var = paddle.to_tensor(3) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] @@ -1501,10 +1521,9 @@ def empty(shape, dtype=None, name=None): Returns a Tensor with uninitialized data which size is same as ``shape``. Args: - shape(list|tuple|Tensor): Shape of the Tensor to be created. - The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, - the elements of it should be integers or Tensors with shape [1]. - If ``shape`` is an Tensor, it should be an 1-D Tensor. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype(np.dtype|str, optional): Data type of the output Tensor which can be bool, float16, float32, float64, int32, int64, if dytpe is `None`, the data type of created Tensor use global default dtype (see ``get_default_dtype`` @@ -1519,30 +1538,25 @@ def empty(shape, dtype=None, name=None): import paddle - paddle.set_device("cpu") # and use cpu device - - # example 1: argument ``shape`` is a list which doesn't contain Tensor. - data1 = paddle.empty(shape=[2, 3], dtype='float32') - print(data1) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0.00000000, 0. , 0.00000000], - # [0. , 0.29652897, 0.09356152]]) # uninitialized + # shape is a list/tuple + data1 = paddle.empty(shape=[3, 2]) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] - # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32. - shape_data = paddle.to_tensor([2, 3]).astype('int32') - data2 = paddle.empty(shape=shape_data, dtype='float32') - print(data2) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[-0.50543123, -0.09872390, -0.92634487], - # [-0.51007903, -0.02454148, 1.29315734]]) # uninitialized + # shape is a Tensor + shape = paddle.to_tensor([3, 2]) + data2 = paddle.empty(shape=shape) + # [[1. 1.] + # [1. 1.] + # [1. 
1.]] - # example 3: argument ``shape`` is a list which contains Tensor. - dim2 = paddle.to_tensor([3]).astype('int32') - data3 = paddle.empty(shape=[2, dim2], dtype='float32') - print(data3) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[ 0.00000000, 0. , -0.92634487], - # [-0.51007903, -0.02454148, 1.29315734]]) # uninitialized + # shape is a Tensor List + shape = [paddle.to_tensor(3), paddle.to_tensor(2)] + data3 = paddle.empty(shape=shape) + # [[1. 1.] + # [1. 1.] + # [1. 1.]] """ if dtype is None: diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index d49941e199bae..54e8459661c45 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -221,11 +221,9 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): distribution, with ``shape`` and ``dtype``. Args: - shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. mean (float|int, optional): Mean of the output tensor, default is 0.0. std (float|int, optional): Standard deviation of the output tensor, default is 1.0. @@ -307,11 +305,9 @@ def standard_normal(shape, dtype=None, name=None): and ``dtype``. Args: - shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` @@ -335,8 +331,8 @@ def standard_normal(shape, dtype=None, name=None): # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. - dim1 = paddle.to_tensor([2], 'int64') - dim2 = paddle.to_tensor([3], 'int32') + dim1 = paddle.to_tensor(2, 'int64') + dim2 = paddle.to_tensor(3, 'int32') out2 = paddle.standard_normal(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random @@ -362,11 +358,9 @@ def randn(shape, dtype=None, name=None): and ``dtype``. Args: - shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. 
+ If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` @@ -390,8 +384,8 @@ def randn(shape, dtype=None, name=None): # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. - dim1 = paddle.to_tensor([2], 'int64') - dim2 = paddle.to_tensor([3], 'int32') + dim1 = paddle.to_tensor(2, 'int64') + dim2 = paddle.to_tensor(3, 'int32') out2 = paddle.randn(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random @@ -429,12 +423,10 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): If ``std`` is float, all elements of the output Tensor shared the same standard deviation. If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations. Defaule is 1.0 - shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). If ``mean`` or ``std`` is a Tensor, the shape of the output - Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. If ``mean`` or ``std`` + is a Tensor, the shape of the output Tensor is the same as ``mean`` or ``std`` , attr ``shape`` is ignored. Default is None name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -518,11 +510,9 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): result=[[0.8505902, 0.8397286]] Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype(str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` @@ -557,8 +547,8 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): # example 2: # attr shape is a list which contains Tensor. - dim1 = paddle.to_tensor([2], 'int64') - dim2 = paddle.to_tensor([3], 'int32') + dim1 = paddle.to_tensor(2, 'int64') + dim2 = paddle.to_tensor(3, 'int32') out2 = paddle.uniform(shape=[dim1, dim2]) # [[-0.9951253, 0.30757582, 0.9899647 ], # random # [ 0.5864527, 0.6607096, -0.8886161]] # random @@ -684,11 +674,9 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): high (int, optional): The upper bound on the range of random values to generate, the ``high`` is excluded in the range. 
Default is None (see above for behavior if high = None). Default is None. - shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). Default is [1]. + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. Default is [1]. dtype (str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64. If ``dytpe`` is None, the data type is int64. Default is None. @@ -707,22 +695,23 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): # example 1: # attr shape is a list which doesn't contain Tensor. - out1 = paddle.randint(low=-5, high=5, shape=[3]) + out1 = paddle.randint(low=-5, high=5, shape=[2, 3]) # [0, -3, 2] # random # example 2: # attr shape is a list which contains Tensor. - dim1 = paddle.to_tensor([2], 'int64') - dim2 = paddle.to_tensor([3], 'int32') + dim1 = paddle.to_tensor(2, 'int64') + dim2 = paddle.to_tensor(3, 'int32') out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2]) # [[0, -1, -3], # random # [4, -2, 0]] # random # example 3: # attr shape is a Tensor - shape_tensor = paddle.to_tensor(3) + shape_tensor = paddle.to_tensor([2, 3]) out3 = paddle.randint(low=-5, high=5, shape=shape_tensor) - # [-2, 2, 3] # random + # [[ 2, -3, -1], # random + # [-3, -2, 1]]) # random # example 4: # data type is int32 @@ -1033,11 +1022,9 @@ def rand(shape, dtype=None, name=None): distribution in the range [0, 1), with ``shape`` and ``dtype``. Args: - shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` - is a list or tuple, the elements of it should be integers or Tensors - (with the shape [1], and the data type int32 or int64). If ``shape`` - is a Tensor, it should be a 1-D Tensor(with the data type int32 or - int64). + shape (tuple|list|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . + If ``shape`` is a list or tuple, each element of it should be integer or 0-D Tensor with shape []. + If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list. dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` @@ -1061,8 +1048,8 @@ def rand(shape, dtype=None, name=None): # [0.22550228, 0.22106001, 0.7877319 ]] # random # example 2: attr shape is a list which contains Tensor. 
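        # A brief illustrative sketch of the 0-D Tensor convention these doc
        # updates describe: a 0-D Tensor stands in for a plain Python scalar,
        # both as an arange bound/step and as an element of a shape list.
        # The concrete values below are only examples.
        import paddle
        start = paddle.to_tensor(3)             # 0-D Tensor holding the scalar 3
        end = paddle.to_tensor(9)               # 0-D Tensor holding the scalar 9
        out_a = paddle.arange(start, end, 2)    # [3, 5, 7]
        dim = paddle.to_tensor(4, 'int64')      # 0-D Tensor holding the scalar 4
        out_b = paddle.rand(shape=[2, dim])     # out_b.shape is [2, 4]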
- dim1 = paddle.to_tensor([2], 'int64') - dim2 = paddle.to_tensor([3], 'int32') + dim1 = paddle.to_tensor(2, 'int64') + dim2 = paddle.to_tensor(3, 'int32') out2 = paddle.rand(shape=[dim1, dim2, 2]) # [[[0.8879919 , 0.25788337], # random # [0.28826773, 0.9712097 ], # random @@ -1076,7 +1063,6 @@ def rand(shape, dtype=None, name=None): out3 = paddle.rand(shape_tensor) # [[0.22920267, 0.841956 , 0.05981819], # random # [0.4836288 , 0.24573246, 0.7516129 ]] # random - """ return uniform(shape, dtype, min=0.0, max=1.0, name=name) From 2d383b811d9429eba1bd5c8278eae67b8dcea08e Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 14 Nov 2022 20:53:31 +0800 Subject: [PATCH 005/210] Remove place for process group (#47857) --- .../distributed/collective/ProcessGroup.h | 7 +++---- .../collective/ProcessGroupBKCL.cc | 13 +++++++----- .../distributed/collective/ProcessGroupBKCL.h | 1 - .../collective/ProcessGroupCustom.cc | 15 ++++++------- .../collective/ProcessGroupCustom.h | 2 +- .../collective/ProcessGroupGloo.cc | 3 +-- .../distributed/collective/ProcessGroupGloo.h | 1 - .../collective/ProcessGroupNCCL.cc | 13 +++++++----- .../distributed/collective/ProcessGroupNCCL.h | 1 - .../collective/ProcessGroupStream.cc | 7 ++----- .../collective/ProcessGroupStream.h | 2 +- paddle/fluid/distributed/collective/Types.h | 3 ++- paddle/fluid/pybind/distributed_py.cc | 21 +++++++------------ python/paddle/distributed/collective.py | 21 +++++++++++-------- .../custom_runtime/process_group_xccl.py | 11 +++++----- .../collective/process_group_gloo.py | 3 +-- .../collective/process_group_nccl.py | 9 ++++---- .../tests/unittests/xpu/process_group_bkcl.py | 10 ++++----- 18 files changed, 67 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 949dd62deb53d..50d2807202d35 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -83,15 +83,14 @@ class ProcessGroup { }; public: + explicit ProcessGroup(int rank, int size, int gid); + virtual ~ProcessGroup() = default; + // TODO(dev): This constructor will be removed later. 
explicit ProcessGroup(int rank, int size, const platform::Place& place, int gid); - explicit ProcessGroup(int rank, int size, int gid); - - virtual ~ProcessGroup() {} - int GetRank() const { return rank_; } int GetSize() const { return size_; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 40f2172b374ca..d9b6d490a5570 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/errors.h" namespace paddle { namespace distributed { @@ -68,11 +69,8 @@ void ProcessGroupBKCL::BKCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupBKCL::ProcessGroupBKCL(const std::shared_ptr& store, int rank, int size, - const platform::Place& place, int gid) - : ProcessGroupStream(rank, size, place, gid), store_(store) { - platform::SetXPUDeviceId(place_.device); -} + : ProcessGroupStream(rank, size, gid), store_(store) {} void ProcessGroupBKCL::GroupStart() { PADDLE_ENFORCE_XPU_SUCCESS(bkcl_group_start()); @@ -255,8 +253,13 @@ std::shared_ptr ProcessGroupBKCL::AllGather( std::shared_ptr ProcessGroupBKCL::Barrier( const BarrierOptions& opts) { + PADDLE_ENFORCE_GE(opts.device_id, + 0, + platform::errors::PreconditionNotMet( + "The barrier device id must greater or equal than 0.")); + platform::XPUPlace place(opts.device_id); auto allocator = std::unique_ptr( - new paddle::experimental::DefaultAllocator(place_)); + new paddle::experimental::DefaultAllocator(place)); phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1}); phi::DenseTensor barrier_tensor{allocator.get(), meta}; diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index 0041d903de78a..6d457c88a8e77 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -71,7 +71,6 @@ class ProcessGroupBKCL : public ProcessGroupStream { ProcessGroupBKCL(const std::shared_ptr& store, int rank, int size, - const platform::Place& place, int gid); std::string GetBackendName() const override { diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index 87bd474477eb9..d71a8b975e46e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -98,15 +98,11 @@ bool ProcessGroupCustom::CustomTask::Wait(std::chrono::milliseconds timeout) { void ProcessGroupCustom::CustomTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupCustom::ProcessGroupCustom(const std::shared_ptr& store, + const std::string& device_type, int rank, int size, - const platform::Place& place, int gid) - : ProcessGroup(rank, size, place, gid), - store_(store), - device_type_(place.GetDeviceType()) { - phi::DeviceManager::SetDevice(place_); -} + : ProcessGroup(rank, size, gid), store_(store), device_type_(device_type) {} void ProcessGroupCustom::BroadcastUniqueCustomID( std::vector& ccl_ids) { // NOLINT @@ -379,7 +375,12 @@ std::shared_ptr ProcessGroupCustom::Broadcast( std::shared_ptr ProcessGroupCustom::Barrier( const BarrierOptions& opts) { // Only support single card single process - std::vector places = {place_}; + PADDLE_ENFORCE_GE(opts.device_id, 
+ 0, + platform::errors::PreconditionNotMet( + "The barrier device id must greater or equal than 0.")); + platform::CustomPlace place(device_type_, opts.device_id); + std::vector places = {place}; std::vector barrierTensors; barrierTensors.reserve(places.size()); diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index 15d6193237d87..b74d0c70de623 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -64,9 +64,9 @@ class ProcessGroupCustom : public ProcessGroup { }; ProcessGroupCustom(const std::shared_ptr& store, + const std::string& device_type, int rank, int size, - const platform::Place& place, int gid); std::string GetBackendName() const override { return "XCCL_" + device_type_; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 5cb4daf728b33..2d6d4c88dd4d0 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -180,10 +180,9 @@ ProcessGroupGloo::ProcessGroupGloo( const std::shared_ptr& store, int rank, int world_size, - const platform::Place& place, int gid, const std::shared_ptr options) - : ProcessGroup(rank, world_size, place, gid), + : ProcessGroup(rank, world_size, gid), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 9796f91663954..4e2c0eff12c82 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -102,7 +102,6 @@ class ProcessGroupGloo : public ProcessGroup { const std::shared_ptr& store, int rank, int world_size, - const platform::Place& place, int gid, std::shared_ptr options); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 3748f22ebe52d..a1f7754a57900 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/phi/api/lib/utils/allocator.h" DECLARE_bool(nccl_blocking_wait); @@ -81,11 +82,8 @@ void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, - const platform::Place& place, int gid) - : ProcessGroupStream(rank, size, place, gid), store_(store) { - platform::SetDeviceId(place_.device); -} + : ProcessGroupStream(rank, size, gid), store_(store) {} void ProcessGroupNCCL::GroupStart() { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); @@ -182,8 +180,13 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { + PADDLE_ENFORCE_GE(opts.device_id, + 0, + platform::errors::PreconditionNotMet( + "The barrier device id must greater or equal than 0.")); + platform::CUDAPlace place(opts.device_id); auto allocator = std::unique_ptr( - new paddle::experimental::DefaultAllocator(place_)); + new paddle::experimental::DefaultAllocator(place)); phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1}); 
phi::DenseTensor barrier_tensor{allocator.get(), meta}; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 54ac390231734..7933636e3d17b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -85,7 +85,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, - const platform::Place& place, int gid); std::string GetBackendName() const override { return "NCCL"; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 0c191428502dd..7fd01576fabe0 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -17,11 +17,8 @@ namespace paddle { namespace distributed { -ProcessGroupStream::ProcessGroupStream(int rank, - int size, - const platform::Place& place, - int gid) - : ProcessGroup(rank, size, place, gid) {} +ProcessGroupStream::ProcessGroupStream(int rank, int size, int gid) + : ProcessGroup(rank, size, gid) {} const phi::DeviceContext& ProcessGroupStream::GetDeviceContext( const Place& place, bool use_calc_stream) const { diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index ec1a3391911fe..fd68f6db5e360 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -55,7 +55,7 @@ class ProcessGroupStream : public ProcessGroup { }; public: - ProcessGroupStream(int rank, int size, const platform::Place& place, int gid); + ProcessGroupStream(int rank, int size, int gid); virtual ~ProcessGroupStream() = default; virtual const phi::DeviceContext& GetDeviceContext( diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 0ce92111f6a13..11628ea1f052a 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -16,6 +16,7 @@ #include #include #include +#include "paddle/phi/common/place.h" namespace paddle { namespace distributed { @@ -33,7 +34,7 @@ struct BroadcastOptions { }; struct BarrierOptions { - std::vector place_ids; + int8_t device_id; }; struct ReduceOptions { diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index de415393caacf..9c7a89c395fa9 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -110,7 +110,7 @@ void BindDistributed(py::module *m) { py::class_(*m, "BarrierOptions") .def(py::init<>()) - .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); + .def_readwrite("device_id", &distributed::BarrierOptions::device_id); py::class_(*m, "ReduceOptions") .def(py::init<>()) @@ -513,12 +513,12 @@ void BindDistributed(py::module *m) { .def( "barrier", - [](distributed::ProcessGroup &self, std::vector place_ids) { + [](distributed::ProcessGroup &self, int8_t device_id) { distributed::BarrierOptions opts; - opts.place_ids = place_ids; + opts.device_id = device_id; return self.Barrier(opts); }, - py::arg("place_ids") = std::vector{}, + py::arg("device_id") = -1, py::call_guard()) // TODO(liyurui): Interface below will be removed in the future. 
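For orientation, a minimal sketch of how this reworked barrier binding is driven from Python, mirroring the updated tests later in this patch; ``init_process_group`` is the tests' helper and is assumed to be in scope, and the device string is illustrative.

    import paddle

    device_id = paddle.distributed.ParallelEnv().dev_id
    paddle.set_device('gpu:%d' % device_id)
    pg = init_process_group()      # test helper (assumed); builds a ProcessGroupNCCL
    task = pg.barrier(device_id)   # device id now passed explicitly, no place argument
    task.wait()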
@@ -1214,12 +1214,10 @@ void BindDistributed(py::module *m) { .def(py::init &, int, int, - const platform::CUDAPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("place"), py::arg("group_id") = 0, py::call_guard()); @@ -1254,14 +1252,14 @@ void BindDistributed(py::module *m) { std::shared_ptr>( *m, "ProcessGroupCustom", ProcessGroup) .def(py::init &, + const std::string &, int, int, - const platform::CustomPlace &, int>(), py::arg("store"), + py::arg("device_type"), py::arg("rank"), py::arg("world_size"), - py::arg("place"), py::arg("group_id") = 0, py::call_guard()); @@ -1275,12 +1273,10 @@ void BindDistributed(py::module *m) { .def(py::init &, int, int, - const platform::XPUPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("place"), py::arg("group_id") = 0, py::call_guard()); #endif @@ -1303,14 +1299,12 @@ void BindDistributed(py::module *m) { .def(py::init &, int, int, - const platform::CPUPlace &, int, std::shared_ptr &>(), py::call_guard()) .def(py::init([](const std::shared_ptr &store, int rank, int world_size, - const platform::CPUPlace &place, int gid) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); @@ -1321,12 +1315,11 @@ void BindDistributed(py::module *m) { opts->device = ProcessGroupGloo::createDefaultDevice(); } return std::make_shared( - store, rank, world_size, place, gid, opts); + store, rank, world_size, gid, opts); }), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("place"), py::arg("group_id") = 0, py::call_guard()) .def_static("create_default_device", diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index c4e09a620fc10..4bdc473f9a0ac 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -152,17 +152,15 @@ def _new_process_group_impl( genv = _get_global_env() assert backend in _valid_backend_list, "Unsupported backend: %s." 
% backend if backend == "gloo": - place = core.CPUPlace() - pg = core.ProcessGroupGloo(store, rank, world_size, place, group_id) + pg = core.ProcessGroupGloo(store, rank, world_size, group_id) elif backend == "nccl": - place = core.CUDAPlace(genv.device_id) - pg = core.ProcessGroupNCCL(store, rank, world_size, place, group_id) + pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) elif backend == "xccl": - place = core.CustomPlace(genv.device_type, genv.device_id) - pg = core.ProcessGroupCustom(store, rank, world_size, place, group_id) + pg = core.ProcessGroupCustom( + store, genv.device_type, rank, world_size, group_id + ) elif backend == "bkcl": - place = core.XPUPlace(genv.device_id) - pg = core.ProcessGroupBKCL(store, rank, world_size, place, group_id) + pg = core.ProcessGroupBKCL(store, rank, world_size, group_id) return pg @@ -192,7 +190,12 @@ def barrier(group=None): if in_dygraph_mode(): group = _get_default_group() if group is None else group - task = group.process_group.barrier() + place = paddle.fluid.framework._current_expected_place() + if isinstance(place, paddle.fluid.core.CPUPlace): + task = group.process_group.barrier() + else: + device_id = place.get_device_id() + task = group.process_group.barrier(device_id) task.wait() return diff --git a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py index 201b2d3df8657..1d3dfce9597a1 100644 --- a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py +++ b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py @@ -30,9 +30,9 @@ def init_process_group(strategy=None): store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) pg_group = core.ProcessGroupCustom( store, + ParallelEnv().device_type, rank, nranks, - paddle.CustomPlace(ParallelEnv().device_type, ParallelEnv().device_id), ) return pg_group @@ -51,9 +51,8 @@ def config(self): def test_create_process_group_xccl(self): with _test_eager_guard(): - paddle.set_device( - 'custom_cpu:%d' % paddle.distributed.ParallelEnv().dev_id - ) + device_id = paddle.distributed.ParallelEnv().dev_id + paddle.set_device('custom_cpu:%d' % device_id) pg = init_process_group() @@ -119,11 +118,11 @@ def test_create_process_group_xccl(self): # test barrier # rank 0 if pg.rank() == 0: - task = pg.barrier() + task = pg.barrier(device_id) task.wait() # rank 1 else: - task = pg.barrier() + task = pg.barrier(device_id) task.wait() print("test barrier api ok\n") diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py index 3ace517991322..f93adb60910b5 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py @@ -42,8 +42,7 @@ def test_create_process_group_gloo(self): store = paddle.fluid.core.TCPStore( "127.0.0.1", 6272, is_master, nranks, 30 ) - place = paddle.fluid.core.CPUPlace() - pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks, place) + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) # test allreduce sum # rank 0 diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index ff949d8f14cf4..0303f469b301c 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -44,9 +44,8 @@ 
def config(self): def test_create_process_group_nccl(self): with _test_eager_guard(): - paddle.set_device( - 'gpu:%d' % paddle.distributed.ParallelEnv().dev_id - ) + device_id = paddle.distributed.ParallelEnv().dev_id + paddle.set_device('gpu:%d' % device_id) pg = init_process_group() print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name()) @@ -170,10 +169,10 @@ def test_create_process_group_nccl(self): # test barrier # rank 0 if pg.rank() == 0: - dist.barrier() + pg.barrier(device_id) # rank 1 else: - task = pg.barrier() + task = pg.barrier(device_id) task.wait() print("test barrier api ok\n") diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index bb2cf6e1db7e0..5ea3845c0bd18 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -20,7 +20,6 @@ import paddle from paddle.fluid.framework import _test_eager_guard from paddle.fluid.dygraph.parallel import ParallelEnv -import paddle.distributed as dist def init_process_group(strategy=None): @@ -45,9 +44,8 @@ def config(self): def test_create_process_group_bkcl(self): with _test_eager_guard(): - paddle.set_device( - 'xpu:%d' % paddle.distributed.ParallelEnv().dev_id - ) + device_id = paddle.distributed.ParallelEnv().dev_id + paddle.set_device('xpu:%d' % device_id) pg = init_process_group() sys.stdout.write( @@ -108,10 +106,10 @@ def test_create_process_group_bkcl(self): # test barrier # rank 0 if pg.rank() == 0: - dist.barrier() + pg.barrier(device_id) # rank 1 else: - task = pg.barrier() + task = pg.barrier(device_id) task.wait() sys.stdout.write("rank {}: test barrier api ok\n".format(pg.rank())) From dac0f7dd2fc68522446e0fa8ac182e1760257fa5 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Mon, 14 Nov 2022 20:56:00 +0800 Subject: [PATCH 006/210] [Paddle Inference] Add where trt converter (#47820) --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/where_op.cc | 62 +++++ paddle/fluid/inference/tensorrt/engine.h | 4 + paddle/fluid/inference/tensorrt/op_teller.cc | 13 + .../operators/tensorrt/tensorrt_engine_op.h | 10 +- .../unittests/ir/inference/auto_scan_test.py | 25 +- .../ir/inference/test_trt_convert_where.py | 231 ++++++++++++++++++ 8 files changed, 337 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/where_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 686af28f76b81..48dc6f0afcda7 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2259,6 +2259,7 @@ USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); USE_TRT_CONVERTER(shuffle_channel); +USE_TRT_CONVERTER(where); USE_TRT_CONVERTER(swish); USE_TRT_CONVERTER(silu); USE_TRT_CONVERTER(group_norm); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 3a9a7527db69c..7ede7cd2a2b81 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -25,6 +25,7 @@ list( multihead_matmul_op.cc 
multihead_matmul_roformer_op.cc shuffle_channel_op.cc + where_op.cc swish_op.cc silu_op.cc instance_norm_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/where_op.cc b/paddle/fluid/inference/tensorrt/convert/where_op.cc new file mode 100644 index 0000000000000..aff67b981d871 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/where_op.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Where Op + */ +class WhereOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid where op to tensorrt where layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_x_name = op_desc.Input("X").front(); + std::string condition_name = op_desc.Input("Condition").front(); + std::string input_y_name = op_desc.Input("Y").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto input_x_tensor = engine_->GetITensor(input_x_name); + const auto condition_tensor = engine_->GetITensor(condition_name); + const auto input_y_tensor = engine_->GetITensor(input_y_name); + + auto layer = TRT_ENGINE_ADD_LAYER( + engine_, Select, *condition_tensor, *input_x_tensor, *input_y_tensor); + + RreplenishLayerAndOutput(layer, "where", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(where, WhereOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index f19b9fc505acf..b0e300dca6047 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -63,6 +63,10 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kINT32; case FluidDT::VarType_Type_FP16: return TRT_DT::kHALF; +#if IS_TRT_VERSION_GE(8400) + case FluidDT::VarType_Type_BOOL: + return TRT_DT::kBOOL; +#endif default: return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 32297317df806..fd21e70780bd0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1654,6 +1654,17 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } + if (op_type == "where") { +#if !IS_TRT_VERSION_GE(8400) + VLOG(3) << "where is not supported when TensorRT < 8.4"; + return false; +#endif + if (!with_dynamic_shape) { + VLOG(3) << "the where op does not support static shape yet"; + return false; + } + } + if (op_type == "skip_layernorm") { if (!with_dynamic_shape) { VLOG(3) << "the skip_layernorm does not support static 
shape yet"; @@ -2285,6 +2296,7 @@ struct SimpleOpTypeSetTeller : public Teller { "leaky_relu", "fc", "shuffle_channel", + "where", "swish", "silu", "celu", @@ -2409,6 +2421,7 @@ struct SimpleOpTypeSetTeller : public Teller { "leaky_relu", "fc", "shuffle_channel", + "where", "swish", "silu", "celu", diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 8096acc0a821c..38cf5e2b82346 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -601,10 +601,14 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::FP16) { buffers[bind_index] = static_cast(t.data()); +#if IS_TRT_VERSION_GE(8400) + } else if (type == framework::proto::VarType::BOOL) { + buffers[bind_index] = static_cast(t.data()); +#endif } else { - PADDLE_THROW( - platform::errors::Fatal("The TRT Engine OP only support " - "float/int32_t/int64_t/float16 input.")); + PADDLE_THROW(platform::errors::Fatal( + "The TRT Engine OP only support " + "float/int32_t/int64_t/float16/bool input.")); } } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 34e43338b589e..243074787ebc4 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -181,14 +181,25 @@ def generate_op_config( ops = [] for i in range(len(ops_config)): op_config = ops_config[i] - ops.append( - OpConfig( - type=op_config['op_type'], - inputs=op_config['op_inputs'], - outputs=op_config['op_outputs'], - attrs=op_config['op_attrs'], + if 'outputs_dtype' in op_config: + ops.append( + OpConfig( + type=op_config['op_type'], + inputs=op_config['op_inputs'], + outputs=op_config['op_outputs'], + attrs=op_config['op_attrs'], + outputs_dtype=op_config['outputs_dtype'], + ) + ) + else: + ops.append( + OpConfig( + type=op_config['op_type'], + inputs=op_config['op_inputs'], + outputs=op_config['op_outputs'], + attrs=op_config['op_attrs'], + ) ) - ) return ops @abc.abstractmethod diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py new file mode 100644 index 0000000000000..45d8f7d30a568 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py @@ -0,0 +1,231 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
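+# A rough illustrative sketch of the graph pattern the converter and the test
+# below exercise: a float condition tensor is cast to bool, then fed to where()
+# together with x and y; the tensor values here are made up for illustration.
+#
+#     import paddle
+#     cond = paddle.to_tensor([1.0, 0.0, 1.0]).astype('bool')
+#     x = paddle.ones([3])
+#     y = paddle.zeros([3])
+#     out = paddle.where(cond, x, y)   # picks x where cond is True, y elsewhere -> [1., 0., 1.]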
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8415: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(dims, batch): + if dims == 1: + return np.zeros((batch)).astype(np.float32) + elif dims == 2: + return np.ones((batch, 4)).astype(np.float32) + elif dims == 3: + return np.ones((batch, 4, 6)).astype(np.float32) + else: + return np.ones((batch, 4, 6, 8)).astype(np.float32) + + def generate_input2(dims, batch): + if dims == 1: + return np.zeros((batch)).astype(np.float32) + elif dims == 2: + return np.ones((batch, 4)).astype(np.float32) + elif dims == 3: + return np.ones((batch, 4, 6)).astype(np.float32) + else: + return np.ones((batch, 4, 6, 8)).astype(np.float32) + + def generate_input3(dims, batch): + if dims == 1: + return np.zeros((batch)).astype(np.float32) + elif dims == 2: + return np.ones((batch, 4)).astype(np.float32) + elif dims == 3: + return np.ones((batch, 4, 6)).astype(np.float32) + else: + return np.ones((batch, 4, 6, 8)).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 2]: + self.dims = dims + dics = [{}] + ops_config = [ + { + "op_type": "cast", + "op_inputs": {"X": ["condition_data"]}, + "op_outputs": {"Out": ["condition_data_bool"]}, + "op_attrs": {"in_dtype": 5, "out_dtype": 0}, + "outputs_dtype": {"condition_data_bool": np.bool}, + }, + { + "op_type": "where", + "op_inputs": { + "Condition": ["condition_data_bool"], + "X": ["input_x_data"], + "Y": ["input_y_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + "outputs_dtype": {"condition_data_bool": np.bool}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "condition_data": TensorConfig( + data_gen=partial(generate_input1, dims, batch) + ), + "input_x_data": TensorConfig( + data_gen=partial(generate_input2, dims, batch) + ), + "input_y_data": TensorConfig( + data_gen=partial(generate_input3, dims, batch) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = { + "condition_data": [1], + "condition_data_bool": [1], + "input_x_data": [1], + "input_y_data": [1], + } + self.dynamic_shape.max_input_shape = { + "condition_data": [2], + "condition_data_bool": [2], + "input_x_data": [2], + "input_y_data": [2], + } + self.dynamic_shape.opt_input_shape = { + "condition_data": [1], + "condition_data_bool": [1], + "input_x_data": [1], + "input_y_data": [1], + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = { + "condition_data": [1, 4], + "condition_data_bool": [1, 4], + "input_x_data": [1, 4], + "input_y_data": [1, 4], + } + self.dynamic_shape.max_input_shape = { + "condition_data": [2, 4], + "condition_data_bool": [2, 4], + "input_x_data": [2, 4], + "input_y_data": [2, 4], + } + self.dynamic_shape.opt_input_shape = { + "condition_data": [1, 4], + "condition_data_bool": [1, 4], 
+ "input_x_data": [1, 4], + "input_y_data": [1, 4], + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "condition_data": [1, 4, 6], + "condition_data_bool": [1, 4, 6], + "input_x_data": [1, 4, 6], + "input_y_data": [1, 4, 6], + } + self.dynamic_shape.max_input_shape = { + "condition_data": [2, 4, 6], + "condition_data_bool": [2, 4, 6], + "input_x_data": [2, 4, 6], + "input_y_data": [2, 4, 6], + } + self.dynamic_shape.opt_input_shape = { + "condition_data": [1, 4, 6], + "condition_data_bool": [1, 4, 6], + "input_x_data": [1, 4, 6], + "input_y_data": [1, 4, 6], + } + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "condition_data": [1, 4, 6, 8], + "condition_data_bool": [1, 4, 6, 8], + "input_x_data": [1, 4, 6, 8], + "input_y_data": [1, 4, 6, 8], + } + self.dynamic_shape.max_input_shape = { + "condition_data": [2, 4, 6, 8], + "condition_data_bool": [2, 4, 6, 8], + "input_x_data": [2, 4, 6, 8], + "input_y_data": [2, 4, 6, 8], + } + self.dynamic_shape.opt_input_shape = { + "condition_data": [1, 4, 6, 8], + "condition_data_bool": [1, 4, 6, 8], + "input_x_data": [1, 4, 6, 8], + "input_y_data": [1, 4, 6, 8], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape: + return 0, 6 + return 1, 5 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 25e63dca10d56343cd6363ca061a530a72df3b92 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Mon, 14 Nov 2022 20:56:37 +0800 Subject: [PATCH 007/210] Refactor collective communication send_partial, recv_partial, all_gather_partial C++ API (#47863) * refactor: simplify send, recv interfaces * refactor: rm send_partial, recv_partial, all_gather_partial --- .../distributed/collective/ProcessGroup.h | 105 ++---- .../collective/ProcessGroupBKCL.cc | 2 + .../distributed/collective/ProcessGroupBKCL.h | 2 + .../collective/ProcessGroupCustom.cc | 28 +- .../collective/ProcessGroupCustom.h | 13 +- .../collective/ProcessGroupGloo.cc | 2 + .../distributed/collective/ProcessGroupGloo.h | 2 + .../collective/ProcessGroupNCCL.cc | 304 +++--------------- .../distributed/collective/ProcessGroupNCCL.h | 62 +--- .../collective/ProcessGroupStream.cc | 166 ++-------- .../collective/ProcessGroupStream.h | 85 +---- paddle/fluid/distributed/collective/utils.h | 32 ++ .../collective/global_gather_op.cu.cc | 16 +- .../collective/global_scatter_op.cu.cc | 18 +- .../collective/partial_allgather_op.cu.cc | 7 +- .../collective/partial_recv_op.cu.cc | 2 +- .../collective/partial_send_op.cu.cc | 2 +- 
paddle/fluid/pybind/distributed_py.cc | 179 +++++------ .../Utils.h => pybind/process_group_utils.h} | 6 +- 19 files changed, 276 insertions(+), 757 deletions(-) create mode 100644 paddle/fluid/distributed/collective/utils.h rename paddle/fluid/{distributed/collective/Utils.h => pybind/process_group_utils.h} (98%) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 50d2807202d35..029a64a25cca4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -98,17 +98,19 @@ class ProcessGroup { virtual std::string GetBackendName() const = 0; virtual const phi::DeviceContext& GetDeviceContext(const Place& place) const { - PADDLE_THROW(platform::errors::InvalidArgument( - "Does not support to get device_context from ProcessGroup%s.", + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support get device_context.", GetBackendName())); } virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support all_gather with sync_op flag", + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_gather with sync_op flag.", GetBackendName())); } @@ -117,15 +119,15 @@ class ProcessGroup { const phi::DenseTensor& in_tensor, const AllreduceOptions& opts, bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support all_reduce with sync_op flag", + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_reduce with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support barrier", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support barrier.", GetBackendName())); } virtual std::shared_ptr Broadcast( @@ -133,46 +135,28 @@ class ProcessGroup { const phi::DenseTensor& in_tensor, const BroadcastOptions& opts, bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(platform::errors::Unimplemented( "ProcessGroup%s does not support broadcast with sync_op flag", GetBackendName())); } virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, + int64_t offset, + int64_t numel, bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support recv with sync_op flag", - GetBackendName())); - } - - virtual std::shared_ptr RecvPartial( - phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support recv_partial with sync_op flag", + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support recv with sync_op flag.", GetBackendName())); } virtual std::shared_ptr Send(phi::DenseTensor*, int dst_rank, + int64_t offset, + int64_t numel, bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send with sync_op flag", - GetBackendName())); - } - - virtual std::shared_ptr SendPartial( - phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support 
send_partial with sync_op flag", + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support send with sync_op flag.", GetBackendName())); } @@ -240,38 +224,6 @@ class ProcessGroup { GetBackendName())); } - virtual std::shared_ptr Send_Partial( - phi::DenseTensor&, // NOLINT - int, - int64_t, - int64_t) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send_partial", GetBackendName())); - } - - virtual std::shared_ptr Send_Partial( - phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send_partial with sync_op flag", - GetBackendName())); - } - - virtual std::shared_ptr Recv_Partial( - phi::DenseTensor&, // NOLINT - int, - int64_t, - int64_t) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support recv_partial", GetBackendName())); - } - - virtual std::shared_ptr Recv_Partial( - phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support recv_partial with sync_op flag", - GetBackendName())); - } - virtual std::shared_ptr AllGather( std::vector&, // NOLINT std::vector&) { // NOLINT @@ -288,25 +240,6 @@ class ProcessGroup { GetBackendName())); } - virtual std::shared_ptr AllGather_Partial( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - int64_t offset, - int64_t length) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); - } - - virtual std::shared_ptr AllGather_Partial( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - int64_t offset, - int64_t length, - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); - } - virtual std::shared_ptr AllToAll( std::vector&, // NOLINT std::vector&) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index d9b6d490a5570..a5c80cb04108d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -228,6 +228,8 @@ std::shared_ptr ProcessGroupBKCL::Broadcast( std::shared_ptr ProcessGroupBKCL::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, // for compatibility, no use now + int64_t numel, // for compatibility, no use now bool sync_op, bool use_calc_stream) { return Collective( diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index 6d457c88a8e77..f7a95f9e48269 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -99,6 +99,8 @@ class ProcessGroupBKCL : public ProcessGroupStream { std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, // for compatibility, no use now + int64_t numel, // for compatibility, no use now bool sync_op, bool use_calc_stream) override; diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index d71a8b975e46e..61e68889190f0 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -259,24 +259,18 @@ void* 
XcclGetPointerByOffset(void* raw_pointer, return nullptr; } -std::shared_ptr ProcessGroupCustom::AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, +// NOTE: this is ONLY for compatibility +std::shared_ptr ProcessGroupCustom::AllGather( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, int64_t offset, - int64_t length) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCustomPlace(in_tensors, device_type_), - true, - platform::errors::InvalidArgument( - "All inputs should be in CustomPlace(%s).", device_type_)); - PADDLE_ENFORCE_EQ( - CheckTensorsInCustomPlace(out_tensors, device_type_), - true, - platform::errors::InvalidArgument( - "All outputs should be in CustomPlace(%s).", device_type_)); + int64_t numel, + bool sync_op) { + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; return Collective( - in_tensors, - out_tensors, + in_wrapper, + out_wrapper, [&](phi::DenseTensor& input, phi::DenseTensor& output, phi::ccl::CCLComm comm, @@ -285,7 +279,7 @@ std::shared_ptr ProcessGroupCustom::AllGather_Partial( device_type_, XcclGetPointerByOffset(input.data(), offset, input.dtype()), output.data(), - length, + numel, phi::ccl::ToCCLDataType(input.dtype()), comm, stream); diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index b74d0c70de623..3ca2d767c7fc5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -72,14 +72,15 @@ class ProcessGroupCustom : public ProcessGroup { std::string GetBackendName() const override { return "XCCL_" + device_type_; } std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, + bool sync_op) override; - std::shared_ptr AllGather_Partial( + std::shared_ptr AllGather( std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length) override; + std::vector& out_tensors) override; std::shared_ptr AllReduce( std::vector& in_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 2d6d4c88dd4d0..2574eb11be200 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -393,6 +393,8 @@ class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { std::shared_ptr ProcessGroupGloo::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, // for compatibility, no use now + int64_t numel, // for compatibility, no use now bool sync_op) { std::vector in_wrapper = {in_tensor}; std::vector out_wrapper = {*out_tensor}; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 4e2c0eff12c82..474fb0c027c62 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -110,6 +110,8 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, // for compatibility, no use now + int64_t numel, // for compatibility, no use now bool sync_op) override; std::shared_ptr Broadcast( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc 
b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index a1f7754a57900..d7d5beea8959b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/Common.h" +#include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -129,15 +130,20 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { std::shared_ptr ProcessGroupNCCL::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) { + // numel > 0 indicates the tensor need to be sliced + const phi::DenseTensor& in_tensor_maybe_partial = + numel > 0 ? GetPartialTensor(in_tensor, offset, numel) : in_tensor; return Collective( out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + in_tensor_maybe_partial, + [](phi::DenseTensor* output, + const phi::DenseTensor& input, + ncclComm_t comm, + gpuStream_t stream) { return platform::dynload::ncclAllGather( input.data(), output->data(), @@ -229,48 +235,25 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( } std::shared_ptr ProcessGroupNCCL::Recv( - phi::DenseTensor* tensor, - int src_rank, - bool sync_op, - bool use_calc_stream) { - return PointToPoint( - tensor, - src_rank, - [&](phi::DenseTensor* output, - int src, - ncclComm_t comm, - gpuStream_t stream) { - return platform::dynload::ncclRecv( - output->data(), - output->numel(), - platform::ToNCCLDataType(output->dtype()), - src, - comm, - stream); - }, - CommType::RECV, - sync_op, - use_calc_stream); -} - -std::shared_ptr ProcessGroupNCCL::RecvPartial( phi::DenseTensor* tensor, int src_rank, int64_t offset, - int64_t length, + int64_t numel, bool sync_op, bool use_calc_stream) { - phi::DenseTensor tensor_flattened; - tensor_flattened.ShareDataWith(*tensor).Resize({tensor->numel()}); - phi::DenseTensor tensor_recv = - tensor_flattened.Slice(offset, offset + length); + // numel > 0 indicates the tensor need to be sliced + phi::DenseTensor partial_tensor; + if (numel > 0) { + partial_tensor = GetPartialTensor(*tensor, offset, numel); + tensor = &partial_tensor; + } return PointToPoint( - &tensor_recv, + tensor, src_rank, - [&](phi::DenseTensor* output, - int src, - ncclComm_t comm, - gpuStream_t stream) { + [](phi::DenseTensor* output, + int src, + ncclComm_t comm, + gpuStream_t stream) { return platform::dynload::ncclRecv( output->data(), output->numel(), @@ -285,48 +268,25 @@ std::shared_ptr ProcessGroupNCCL::RecvPartial( } std::shared_ptr ProcessGroupNCCL::Send( - phi::DenseTensor* tensor, - int dst_rank, - bool sync_op, - bool use_calc_stream) { - return PointToPoint( - tensor, - dst_rank, - [&](phi::DenseTensor* input, - int dst, - ncclComm_t comm, - gpuStream_t stream) { - return platform::dynload::ncclSend( - input->data(), - input->numel(), - platform::ToNCCLDataType(input->dtype()), - dst, - comm, - stream); - }, - CommType::SEND, - sync_op, - use_calc_stream); -} - -std::shared_ptr ProcessGroupNCCL::SendPartial( phi::DenseTensor* tensor, int dst_rank, int64_t offset, - int64_t length, + int64_t numel, bool sync_op, bool use_calc_stream) { - phi::DenseTensor tensor_flattened; - 
tensor_flattened.ShareDataWith(*tensor).Resize({tensor->numel()}); - phi::DenseTensor tensor_send = - tensor_flattened.Slice(offset, offset + length); + // numel > 0 indicates the tensor need to be sliced + phi::DenseTensor partial_tensor; + if (numel > 0) { + partial_tensor = GetPartialTensor(*tensor, offset, numel); + tensor = &partial_tensor; + } return PointToPoint( - &tensor_send, + tensor, dst_rank, - [&](phi::DenseTensor* input, - int dst, - ncclComm_t comm, - gpuStream_t stream) { + [](phi::DenseTensor* input, + int dst, + ncclComm_t comm, + gpuStream_t stream) { return platform::dynload::ncclSend( input->data(), input->numel(), @@ -1041,132 +1001,6 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } -std::shared_ptr ProcessGroupNCCL::Send_Partial( - phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) { - // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - - phi::DenseTensor flatten_tensor; - flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - - std::vector shared_tensors{ - flatten_tensor.Slice(offset, offset + length)}; - - auto task = PointToPoint( - shared_tensors, - [&](phi::DenseTensor& input, - ncclComm_t comm, - const gpuStream_t& stream, - int dst_rank) { - return platform::dynload::ncclSend( - input.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, - comm, - stream); - }, - dst_rank, - CommType::SEND); - return task; -} - -std::shared_ptr ProcessGroupNCCL::Send_Partial( - phi::DenseTensor& tensors, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - phi::DenseTensor flatten_tensor; - flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - - std::vector shared_tensors{ - flatten_tensor.Slice(offset, offset + length)}; - - auto task = PointToPoint( - shared_tensors, - [&](phi::DenseTensor& input, - ncclComm_t comm, - const gpuStream_t& stream, - int dst_rank) { - return platform::dynload::ncclSend( - input.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, - comm, - stream); - }, - dst_rank, - CommType::SEND, - sync_op, - use_calc_stream); - return task; -} - -std::shared_ptr ProcessGroupNCCL::Recv_Partial( - phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) { - // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); - - phi::DenseTensor flatten_tensor; - flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - - std::vector shared_tensors{ - flatten_tensor.Slice(offset, offset + length)}; - - auto task = PointToPoint( - shared_tensors, - [&](phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream, - int src_rank) { - return platform::dynload::ncclRecv( - output.data(), - output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, - comm, - stream); - }, - src_rank, - CommType::RECV); - return task; -} - -std::shared_ptr ProcessGroupNCCL::Recv_Partial( - phi::DenseTensor& tensors, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - phi::DenseTensor flatten_tensor; - flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - - std::vector shared_tensors{ - flatten_tensor.Slice(offset, offset + length)}; - - auto task = PointToPoint( - shared_tensors, - [&](phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream, - int src_rank) { - return platform::dynload::ncclRecv( - output.data(), - output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, - 
comm, - stream); - }, - src_rank, - CommType::RECV, - sync_op, - use_calc_stream); - return task; -} - std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { @@ -1228,77 +1062,11 @@ void* GetPointerByOffset(void* raw_pointer, offset); } else { PADDLE_THROW(platform::errors::Unimplemented( - "This datatype in nccl is not supported.")); + "Datatype %s in NCCL is not supported.", type)); } return nullptr; } -std::shared_ptr ProcessGroupNCCL::AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( - GetPointerByOffset(input.data(), offset, input.dtype()), - output.data(), - length, - platform::ToNCCLDataType(input.dtype()), - comm, - stream); - }, - CommType::ALLGATHER); -} - -std::shared_ptr ProcessGroupNCCL::AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( - GetPointerByOffset(input.data(), offset, input.dtype()), - output.data(), - length, - platform::ToNCCLDataType(input.dtype()), - comm, - stream); - }, - CommType::ALLGATHER, - sync_op, - use_calc_stream); -} - std::shared_ptr ProcessGroupNCCL::AllToAll( std::vector& in_tensors, std::vector& out_tensors) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 7933636e3d17b..dab6d9428892b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -97,6 +97,8 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) override; @@ -119,30 +121,18 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) override; - std::shared_ptr RecvPartial( - phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr Send(phi::DenseTensor* tensor, int dst_rank, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) override; - std::shared_ptr SendPartial( - phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) override; - static void GroupStart(); static void GroupEnd(); @@ -167,50 
+157,10 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::shared_ptr Recv( std::vector& tensors, int src_rank) override; - std::shared_ptr Send_Partial(phi::DenseTensor& tensors, - int dst_rank, - int64_t offset, - int64_t length) override; - - std::shared_ptr Send_Partial( - phi::DenseTensor& tensors, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) override; - - std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, - int src_rank, - int64_t offset, - int64_t length) override; - - std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; - std::shared_ptr AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length) override; - - std::shared_ptr AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr AllToAll( std::vector& in_tensors, std::vector& out_tensors) override; diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 7fd01576fabe0..2561a4f5b295a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -22,16 +22,20 @@ ProcessGroupStream::ProcessGroupStream(int rank, int size, int gid) const phi::DeviceContext& ProcessGroupStream::GetDeviceContext( const Place& place, bool use_calc_stream) const { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(platform::errors::Unimplemented( "ProcessGroup%s does not support get device_context.", GetBackendName())); } std::shared_ptr ProcessGroupStream::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op) { return AllGather(out_tensor, in_tensor, + offset, + numel, sync_op, /*use_calc_stream*/ false); } @@ -39,10 +43,12 @@ std::shared_ptr ProcessGroupStream::AllGather( std::shared_ptr ProcessGroupStream::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do all_gather", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_gather.", GetBackendName())); } std::shared_ptr ProcessGroupStream::AllReduce( @@ -63,8 +69,8 @@ std::shared_ptr ProcessGroupStream::AllReduce( const AllreduceOptions& opts, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do all_reduce", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_reduce.", GetBackendName())); } std::shared_ptr ProcessGroupStream::Broadcast( @@ -85,14 +91,20 @@ std::shared_ptr ProcessGroupStream::Broadcast( const BroadcastOptions& opts, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do broadcast", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support broadcast.", GetBackendName())); } std::shared_ptr ProcessGroupStream::Recv( - phi::DenseTensor* tensor, int 
src_rank, bool sync_op) { + phi::DenseTensor* tensor, + int src_rank, + int64_t offset, + int64_t numel, + bool sync_op) { return Recv(tensor, src_rank, + offset, + numel, sync_op, /*use_calc_stream*/ false); } @@ -100,74 +112,37 @@ std::shared_ptr ProcessGroupStream::Recv( std::shared_ptr ProcessGroupStream::Recv( phi::DenseTensor* tensor, int src_rank, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do recv", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support recv.", GetBackendName())); } -std::shared_ptr ProcessGroupStream::RecvPartial( +std::shared_ptr ProcessGroupStream::Send( phi::DenseTensor* tensor, - int src_rank, + int dst_rank, int64_t offset, - int64_t length, + int64_t numel, bool sync_op) { - return RecvPartial(tensor, - src_rank, - offset, - length, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::RecvPartial( - phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do recv_partial", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::Send( - phi::DenseTensor* tensor, int dst_rank, bool sync_op) { return Send(tensor, dst_rank, + offset, + numel, sync_op, /*use_calc_stream*/ false); } std::shared_ptr ProcessGroupStream::Send( - phi::DenseTensor*, int dst_rank, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do send", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::SendPartial( - phi::DenseTensor* tensor, + phi::DenseTensor*, int dst_rank, int64_t offset, - int64_t length, - bool sync_op) { - return SendPartial(tensor, - dst_rank, - offset, - length, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::SendPartial( - phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t length, + int64_t numel, bool sync_op, bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do send partial", GetBackendName())); + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support send.", GetBackendName())); } // TODO(sunyilun): methods below will be removed later @@ -281,31 +256,6 @@ std::shared_ptr ProcessGroupStream::Scatter( "ProcessGroup%s does not support do scatter", GetBackendName())); } -std::shared_ptr ProcessGroupStream::Send_Partial( - phi::DenseTensor& tensors, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op) { - return Send_Partial(tensors, - dst_rank, - offset, - length, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::Send_Partial( - phi::DenseTensor& tensors, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do send_partial", GetBackendName())); -} - std::shared_ptr ProcessGroupStream::Recv( std::vector& tensors, int src_rank, bool sync_op) { return Recv(tensors, @@ -323,55 +273,5 @@ std::shared_ptr ProcessGroupStream::Recv( "ProcessGroup%s does not support do recv", GetBackendName())); } -std::shared_ptr ProcessGroupStream::Recv_Partial( - phi::DenseTensor& tensors, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op) { - return 
Recv_Partial(tensors, - src_rank, - offset, - length, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::Recv_Partial( - phi::DenseTensor& tensors, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do recv_partial", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length, - bool sync_op) { - return AllGather_Partial(in_tensors, - out_tensors, - offset, - length, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do recv_partial", GetBackendName())); -} - } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index fd68f6db5e360..15b0635c5a6b8 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -64,11 +64,15 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op) override; virtual std::shared_ptr AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream); @@ -100,50 +104,30 @@ class ProcessGroupStream : public ProcessGroup { std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, + int64_t offset, + int64_t numel, bool sync_op) override; virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream); - std::shared_ptr RecvPartial(phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op) override; - - virtual std::shared_ptr RecvPartial( - phi::DenseTensor* tensor, - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream); - std::shared_ptr Send(phi::DenseTensor* tensor, int dst_rank, + int64_t offset, + int64_t numel, bool sync_op) override; virtual std::shared_ptr Send(phi::DenseTensor* tensor, int dst_rank, + int64_t offset, + int64_t numel, bool sync_op, bool use_calc_stream); - std::shared_ptr SendPartial(phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op) override; - - virtual std::shared_ptr SendPartial( - phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream); - // TODO(sunyilun): methods below will be removed later std::shared_ptr AllToAll( std::vector& in_tensors, // NOLINT @@ -210,21 +194,6 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); - std::shared_ptr Send_Partial( - phi::DenseTensor& tensors, // NOLINT - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op) override; - - virtual std::shared_ptr Send_Partial( - phi::DenseTensor& tensors, // NOLINT - int dst_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream); - std::shared_ptr Recv( std::vector& tensors, // NOLINT int src_rank, @@ 
-235,36 +204,6 @@ class ProcessGroupStream : public ProcessGroup { int src_rank, bool sync_op, bool use_calc_stream); - - std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, // NOLINT - int src_rank, - int64_t offset, - int64_t length, - bool sync_op) override; - - virtual std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, // NOLINT - int src_rank, - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream); - - std::shared_ptr AllGather_Partial( - std::vector& in_tensors, - std::vector& out_tensors, - int64_t offset, - int64_t length, - bool sync_op) override; - - virtual std::shared_ptr AllGather_Partial( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - int64_t offset, - int64_t length, - bool sync_op, - bool use_calc_stream); }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/utils.h b/paddle/fluid/distributed/collective/utils.h new file mode 100644 index 0000000000000..a730a47dd0dff --- /dev/null +++ b/paddle/fluid/distributed/collective/utils.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace paddle { +namespace distributed { + +inline phi::DenseTensor GetPartialTensor(const phi::DenseTensor &tensor, + int64_t offset, + int64_t numel) { + phi::DenseTensor tensor_flattened; + tensor_flattened.ShareDataWith(tensor); + tensor_flattened.Resize({tensor.numel()}); + return tensor_flattened.Slice(offset, offset + numel); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 072e63f8fe340..580f815c9ab6e 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -226,15 +226,19 @@ struct GlobalGatherProcessGroupFunctor { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { phi::DenseTensor tmp = *x; - pg->Send_Partial( - tmp, j, send_ptr * in_feat, cpu_global_count_data[idx] * in_feat); + pg->Send(&tmp, + j, + send_ptr * in_feat, + cpu_global_count_data[idx] * in_feat, + /*sync_op*/ true); send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - pg->Recv_Partial(*out, - j, - expert_ptr[idx] * in_feat, - cpu_local_count_data[idx] * in_feat); + pg->Recv(out, + j, + expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat, + /*sync_op*/ true); } } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index a3d1b35bf0680..a6eb714662200 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -224,16 +224,18 @@ struct GlobalScatterProcessGroupFunctor { int idx = i + j * n_expert; if 
(cpu_local_count_data[idx]) { phi::DenseTensor tmp = *x; - pg->Send_Partial(tmp, - j, - expert_ptr[idx] * in_feat, - cpu_local_count_data[idx] * in_feat); + pg->Send(&tmp, + j, + expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat, + /*sync_op*/ true); } if (cpu_global_count_data[idx]) { - pg->Recv_Partial(*out, - j, - recv_ptr * in_feat, - cpu_global_count_data[idx] * in_feat); + pg->Recv(out, + j, + recv_ptr * in_feat, + cpu_global_count_data[idx] * in_feat, + /*sync_op*/ true); recv_ptr += cpu_global_count_data[idx]; } } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index eeda5c72d9cae..cd1e12d7e1bab 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -67,12 +67,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { if (map->has(rid)) { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); - std::vector in_tensors; - std::vector out_tensors; - in_tensors.push_back(*in); - out_tensors.push_back(*out); - auto task = - pg->AllGather_Partial(in_tensors, out_tensors, offset, send_numel); + auto task = pg->AllGather(out, *in, offset, send_numel, /*sync_op*/ true); task->Wait(); } else { const T* send_buff = in->data() + offset; diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 33074ea725a8f..c8a49f51d5c46 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -75,7 +75,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { if (map->has(rid)) { // Use ProcessGroup distributed::ProcessGroup *pg = map->get(rid); - auto task = pg->Recv_Partial(*out, peer, offset, recv_numel); + auto task = pg->Recv(out, peer, offset, recv_numel, /*sync_op*/ true); task->Wait(); } else { gpuStream_t stream = nullptr; diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index ea835f87d289e..b7196473c9ac1 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -70,7 +70,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); phi::DenseTensor tmp = *x; - auto task = pg->Send_Partial(tmp, peer, offset, send_numel); + auto task = pg->Send(&tmp, peer, offset, send_numel, /*sync_op*/ true); task->Wait(); } else { gpuStream_t stream = nullptr; diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 9c7a89c395fa9..a596275015612 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -24,13 +24,13 @@ limitations under the License. 
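The call sites above (global_gather, global_scatter, partial_allgather, partial_recv, partial_send) all make the same substitution: the dedicated Send_Partial/Recv_Partial/AllGather_Partial entry points are removed, and the regular Send/Recv/AllGather now take an explicit offset and element count, doing the slicing internally via GetPartialTensor. The following is a minimal sketch of the resulting call pattern and is not part of the patch; the function names SendSlice/RecvSlice are invented for illustration, and the surrounding setup (process group lookup, tensor allocation) is assumed.

#include <cstdint>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/phi/core/dense_tensor.h"

// Issue a partial send/recv through the unified API. Pairing and ordering of
// the calls across ranks stays the caller's responsibility, exactly as with
// the removed *_Partial methods. Passing offset = 0 and numel = -1 (as the
// Python bindings below do) communicates the whole tensor.
void SendSlice(paddle::distributed::ProcessGroup* pg,
               const phi::DenseTensor& src,
               int dst_rank,
               int64_t offset,
               int64_t count) {
  phi::DenseTensor tmp = src;  // shallow copy, shares the underlying buffer
  auto task = pg->Send(&tmp, dst_rank, offset, count, /*sync_op*/ true);
  task->Wait();
}

void RecvSlice(paddle::distributed::ProcessGroup* pg,
               phi::DenseTensor* dst,
               int src_rank,
               int64_t offset,
               int64_t count) {
  auto task = pg->Recv(dst, src_rank, offset, count, /*sync_op*/ true);
  task->Wait();
}

Keeping a single Send/Recv code path with offset/numel parameters is what allows the *_Partial overloads, and their duplicated NCCL lambdas, to be deleted throughout this patch.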
*/ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" -#include "paddle/fluid/distributed/collective/Utils.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/process_group_utils.h" #include "paddle/phi/api/all.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -171,7 +171,9 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *out_dense = p_dense.get(); - return self.Send(out_dense, dst, sync_op); + // numel == -1 indicates sending the whole tensor + return self.Send( + out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op); }, py::arg("tensor"), py::arg("dst"), @@ -189,18 +191,20 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + int64_t numel = p_dense->numel(); int64_t send_numel = numel / nranks; int64_t offset = send_numel * rank_id; - auto *out_dense = p_dense.get(); - return self.SendPartial( + + return self.Send( out_dense, dst_rank, offset, send_numel, sync_op); }, py::arg("tensor"), py::arg("dst"), py::arg("num"), py::arg("id"), - py::arg("sync_op"), + py::arg("sync_op") = true, py::call_guard()) .def( @@ -213,7 +217,9 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); - return self.Recv(in_dense, src, sync_op); + // numel == -1 indicates receiving the whole tensor + return self.Recv( + in_dense, src, /*offset*/ 0, /*numel*/ -1, sync_op); }, py::arg("tensor"), py::arg("src"), @@ -231,18 +237,20 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + int64_t numel = p_dense->numel(); int64_t recv_numel = numel / nranks; int64_t offset = recv_numel * rank_id; - auto *out_dense = p_dense.get(); - return self.RecvPartial( + + return self.Recv( out_dense, src_rank, offset, recv_numel, sync_op); }, py::arg("tensor"), py::arg("src"), py::arg("num"), py::arg("id"), - py::arg("sync_op"), + py::arg("sync_op") = true, py::call_guard()) .def( @@ -264,7 +272,11 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; const auto &dev_ctx = self.GetDeviceContext(in_tensor.place()); - auto task = self.AllGather(out_dense, in_dense, sync_op); + auto task = self.AllGather(out_dense, + in_dense, + /*offset*/ 0, + /*numel*/ -1, + sync_op); distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(dev_ctx); return task; @@ -290,7 +302,11 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - return self.AllGather(out_dense, in_dense, sync_op); + return self.AllGather(out_dense, + in_dense, + /*offset*/ 0, + /*numel*/ -1, + sync_op); }, py::arg("out"), py::arg("in"), @@ -571,27 +587,6 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::call_guard()) - .def( - "send_partial", - [](distributed::ProcessGroup &self, - py::handle py_tensor, - int dst_rank, - int nranks, - int rank_id) { - auto tensor = 
CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - int64_t numel = (*dense).numel(); - int64_t send_numel = numel / nranks; - int64_t offset = send_numel * rank_id; - return self.Send_Partial(*dense, dst_rank, offset, send_numel); - }, - py::arg("tensor"), - py::arg("dst"), - py::arg("num"), - py::arg("id"), - py::call_guard()) - .def( "recv", [](distributed::ProcessGroup &self, @@ -607,27 +602,6 @@ void BindDistributed(py::module *m) { py::arg("src"), py::call_guard()) - .def( - "recv_partial", - [](distributed::ProcessGroup &self, - py::handle py_tensor, - int src_rank, - int nranks, - int rank_id) { - auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - auto dense = - std::dynamic_pointer_cast(tensor.impl()); - int64_t numel = (*dense).numel(); - int64_t recv_numel = numel / nranks; - int64_t offset = recv_numel * rank_id; - return self.Recv_Partial(*dense, src_rank, offset, recv_numel); - }, - py::arg("tensor"), - py::arg("src"), - py::arg("num"), - py::arg("id"), - py::call_guard()) - .def( "all_gather", [](distributed::ProcessGroup &self, @@ -650,26 +624,28 @@ void BindDistributed(py::module *m) { .def( "all_gather_partial", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, int nranks, int rank_id) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - int64_t numel = (*in_dense).numel(); + auto *out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + int64_t numel = in_dense.numel(); int64_t send_numel = numel / nranks; int64_t offset = send_numel * rank_id; - return self.AllGather_Partial( - in_tensors, out_tensors, offset, send_numel); + return self.AllGather( + out_dense, in_dense, offset, send_numel, /*sync_op*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("num"), py::arg("id"), py::call_guard()) @@ -785,6 +761,8 @@ void BindDistributed(py::module *m) { self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(out_dense, in_dense, + /*offset*/ 0, + /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); @@ -811,6 +789,8 @@ void BindDistributed(py::module *m) { return self.AllGather(out_dense, in_dense, + /*offset*/ 0, + /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -821,30 +801,33 @@ void BindDistributed(py::module *m) { .def( "all_gather_partial_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, int nranks, int rank_id) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - int64_t numel = (*in_dense).numel(); + auto *out_dense = p_out_tensor.get(); + + auto 
in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + int64_t numel = in_dense.numel(); int64_t send_numel = numel / nranks; int64_t offset = send_numel * rank_id; - return self.AllGather_Partial(in_tensors, - out_tensors, - offset, - send_numel, - /*sync_op*/ true, - /*use_calc_stream*/ true); + + return self.AllGather(out_dense, + in_dense, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("num"), py::arg("id"), py::call_guard()) @@ -1125,8 +1108,11 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *out_dense = p_dense.get(); + // numel == -1 indicates sending the whole tensor return self.Send(out_dense, dst, + /*offset*/ 0, + /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1144,16 +1130,18 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + int64_t numel = p_dense->numel(); int64_t send_numel = numel / nranks; int64_t offset = send_numel * rank_id; - auto *out_dense = p_dense.get(); - return self.SendPartial(out_dense, - dst_rank, - offset, - send_numel, - /*sync_op*/ true, - /*use_calc_stream*/ true); + + return self.Send(out_dense, + dst_rank, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); }, py::arg("tensor"), py::arg("dst"), @@ -1170,8 +1158,11 @@ void BindDistributed(py::module *m) { auto p_dense = std::dynamic_pointer_cast(tensor.impl()); auto *in_dense = p_dense.get(); + // numel == -1 indicates receiving the whole tensor return self.Recv(in_dense, src, + /*offset*/ 0, + /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); }, @@ -1189,16 +1180,18 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + int64_t numel = p_dense->numel(); int64_t recv_numel = numel / nranks; int64_t offset = recv_numel * rank_id; - auto *out_dense = p_dense.get(); - return self.RecvPartial(out_dense, - src_rank, - offset, - recv_numel, - /*sync_op*/ true, - /*use_calc_stream*/ true); + + return self.Recv(out_dense, + src_rank, + offset, + recv_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); }, py::arg("tensor"), py::arg("src"), diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/pybind/process_group_utils.h similarity index 98% rename from paddle/fluid/distributed/collective/Utils.h rename to paddle/fluid/pybind/process_group_utils.h index d9260b98dcf44..35a5ef0b1bb14 100644 --- a/paddle/fluid/distributed/collective/Utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { @@ -110,7 +110,7 @@ void ConcatDenseTensorWithType(const DeviceContext &dev_ctx, ConcatDenseTensor()(dev_ctx, t_list, p_out); break; case phi::DataType::FLOAT16: - ConcatDenseTensor()( + ConcatDenseTensor()( dev_ctx, t_list, p_out); break; case phi::DataType::FLOAT32: @@ -147,7 +147,7 @@ void 
SplitDenseTensorWithType(const DeviceContext &dev_ctx, SplitDenseTensor()(dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT16: - SplitDenseTensor()( + SplitDenseTensor()( dev_ctx, t_in, p_list); break; case phi::DataType::FLOAT32: From 04c29558bce3dddaa660f1acefec08b5d5964df1 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 15 Nov 2022 10:11:52 +0800 Subject: [PATCH 008/210] [PHI decoupling] remove dependency on "paddle/fluid/operators/elementwise/xxx.h" in phi (#47870) * rm "paddle/fluid/operators/elementwise/xxx.h" in phi * fix bugs * add LaunchElementwiseCudaKernel in phi * Revert "add LaunchElementwiseCudaKernel in phi" This reverts commit 588f45bbdad2372ec7bff0c567a29bff675d22e1. * rm indirect dependence to "elementwise_op_impl.cu.h" rm indirect dependence to "elementwise_op_impl.cu.h" Revert "add LaunchElementwiseCudaKernel in phi" This reverts commit 588f45bbdad2372ec7bff0c567a29bff675d22e1. add LaunchElementwiseCudaKernel in phi fix bugs * rm LaunchSameDimsElementwiseCudaKernel and LaunchElementwiseCudaKernel in phi --- paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu | 5 ++--- paddle/phi/kernels/gpu/label_smooth_kernel.cu | 5 ++--- paddle/phi/kernels/gpu/p_norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/viterbi_decode_kernel.cu | 10 +++++----- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu index ddf7657fe528c..2ac6442967b38 100644 --- a/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/label_smooth_grad_kernel.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { template @@ -42,8 +42,7 @@ void LabelSmoothGradKernel(const Context& ctx, std::vector ins = {&out_grad}; std::vector outs = {label_grad}; auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - ctx, ins, &outs, functor); + phi::funcs::ElementwiseKernel(ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu index 4a2f27d3018c4..ff2fff4445174 100644 --- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu @@ -16,9 +16,9 @@ #include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { @@ -77,8 +77,7 @@ void LabelSmoothKernel(const Context& ctx, std::vector ins = {&label}; std::vector outs = {out}; auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - ctx, ins, &outs, functor); + phi::funcs::ElementwiseKernel(ctx, ins, &outs, functor); } } diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index 80ef97d9cf88c..c7a6261ce381e 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/p_norm_kernel.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include 
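The label_smooth changes above, and the p_norm and viterbi_decode changes that follow, all apply the same substitution: the fluid launchers (LaunchSameDimsElementwiseCudaKernel, LaunchElementwiseCudaKernel) are dropped in favor of the functor-based launchers already in phi, with phi::funcs::ElementwiseKernel for same-shape inputs and phi::funcs::BroadcastKernel where broadcasting is involved. Below is a small sketch of the same-shape form only; it is illustrative, the functor and function names are invented, and the exact template arguments are assumed from elementwise_base.h rather than taken from this patch.

#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"

// Hypothetical unary functor, standing in for LabelSmoothFunctor and friends.
template <typename T>
struct AffineFunctor {
  T scale, bias;
  HOSTDEVICE AffineFunctor(T s, T b) : scale(s), bias(b) {}
  HOSTDEVICE inline T operator()(T x) const { return scale * x + bias; }
};

// Same-shape elementwise launch in the phi style used by the kernels in this
// patch; it replaces paddle::operators::LaunchSameDimsElementwiseCudaKernel.
template <typename T>
void LaunchAffine(const phi::GPUContext& ctx,
                  const phi::DenseTensor& x,
                  phi::DenseTensor* out,
                  T scale,
                  T bias) {
  ctx.template Alloc<T>(out);  // output is allocated before the launch
  std::vector<const phi::DenseTensor*> ins = {&x};
  std::vector<phi::DenseTensor*> outs = {out};
  phi::funcs::ElementwiseKernel<T>(
      ctx, ins, &outs, AffineFunctor<T>(scale, bias));
}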
"paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index 31227e59433ea..b80e9253128f3 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -30,13 +30,14 @@ namespace cub = hipcub; #include #include -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/viterbi_decode_functor.h" @@ -90,9 +91,8 @@ struct BinaryOperation { DenseTensor* output) { std::vector ins{&lhs, &rhs}; std::vector outs{output}; - paddle::operators:: - LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, BinaryFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, BinaryFunctor()); } }; @@ -107,7 +107,7 @@ struct GetMask { DenseTensor* mask) { std::vector ins = {&lhs, &rhs}; std::vector outs = {mask}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, CompareFunctor()); } }; From f7bf2930f0d63b580744636b3688b39b49981ce3 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Tue, 15 Nov 2022 10:14:50 +0800 Subject: [PATCH 009/210] remove 'paddle/fluid/operators/conv_op.h' from phi (#47914) --- paddle/phi/kernels/gpu/depthwise_conv_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index 18e6913be1f1e..8617a42e4e544 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/conv_op.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" From aa08b769fa43de8d1071a61a0d4434dcfd58f566 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 15 Nov 2022 10:21:10 +0800 Subject: [PATCH 010/210] [PHI decoupling] remove "paddle/fluid/platform/complex.h" in phi (#47926) * rm "paddle/fluid/platform/complex.h" in phi * fix codestyle with pre-commit --- paddle/phi/backends/dynload/lapack.h | 1 - paddle/phi/kernels/cpu/concat_kernel.cc | 2 +- paddle/phi/kernels/gpu/concat_kernel.cu | 2 +- paddle/phi/kernels/gpu/pad_kernel.cu | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/phi/backends/dynload/lapack.h b/paddle/phi/backends/dynload/lapack.h index 1a680e32d1c32..74051821eaebb 100644 --- a/paddle/phi/backends/dynload/lapack.h +++ b/paddle/phi/backends/dynload/lapack.h @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 625d7d7436a46..96e02f4c42046 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/concat_kernel.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 7635c94cde789..0666c60a8d0c1 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/concat_kernel.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/pad_kernel.cu b/paddle/phi/kernels/gpu/pad_kernel.cu index cdf90513a3934..90d81046b999d 100644 --- a/paddle/phi/kernels/gpu/pad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/pad_kernel.h" -#include "paddle/fluid/platform/complex.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/pad_kernel_impl.h" From 4e09b0893c259c72cfae79f1834201d5e061354b Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 15 Nov 2022 11:00:38 +0800 Subject: [PATCH 011/210] [CodeStyle][F821] fix remaining F821 issues (#47968) * [CodeStyle][F821] fix remained F821 issues * refine comment * fix _set_item --- .flake8 | 6 +++++- .../paddle/distributed/auto_parallel/completion.py | 8 ++++++-- .../distributed/auto_parallel/cost/tensor_cost.py | 4 ++-- .../auto_parallel/operators/dist_reduce_sum_p.py | 2 +- .../dygraph_to_static/test_closure_analysis.py | 4 ++++ .../tests/unittests/dygraph_to_static/yolov3.py | 4 +++- .../paddle/fluid/tests/unittests/gradient_checker.py | 12 ++++++------ .../unittests/test_auto_parallel_reshard_serial.py | 12 ++++++++---- .../fluid/tests/unittests/test_reorder_lod_tensor.py | 4 +++- .../tests/unittests/test_weight_normalization.py | 4 +++- 10 files changed, 41 insertions(+), 19 deletions(-) diff --git a/.flake8 b/.flake8 index 6204cfa9f8b95..84ebba974763b 100644 --- a/.flake8 +++ b/.flake8 @@ -23,7 +23,7 @@ ignore = # F, see https://flake8.pycqa.org/en/latest/user/error-codes.html F405, - F811,F821,F841, + F811,F841, # W, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes W503 @@ -33,3 +33,7 @@ per-file-ignores = python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py:E101,W191 # Ignore unused imports in __init__.py __init__.py: F401 + # Ignore undefined variables in CMake config and some dygraph_to_static tests + .cmake-format.py: F821 + python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py: F821 + 
python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py: F821 diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 02a8c17247534..c0f70f482dd17 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -1510,8 +1510,12 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr ) - grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type - grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx + grad_op_dist_attr.impl_type = ( + fwd_op_dist_attr.impl_type # noqa: F821 + ) + grad_op_dist_attr.impl_idx = ( + fwd_op_dist_attr.impl_idx # noqa: F821 + ) continue diff --git a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py index 0303e29749f9a..9d0794e23757d 100644 --- a/python/paddle/distributed/auto_parallel/cost/tensor_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/tensor_cost.py @@ -100,9 +100,9 @@ def calc_cost(self): if dtype == paddle.float32 or dtype == paddle.int32: dtype_factor = 4 - elif node.dtype == paddle.int64: + elif dtype == paddle.int64: dtype_factor = 8 - elif node.dtype == paddle.uint8: + elif dtype == paddle.uint8: dtype_factor = 1 else: dtype_factor = 2 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py index b9ccb7b3c32a9..01b326d3a562c 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py @@ -141,7 +141,7 @@ def forward(ctx, *args, **kwargs): def backward(ctx, *args, **kwargs): raise RuntimeError( "primitive operator does NOT have backward function, op type: {}".format( - str(op.type) + str(op.type) # noqa: F821 ) ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py index 8712950e01b05..0210e260f8238 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py @@ -299,6 +299,8 @@ def vlist_of_dict(x): print(paddle.jit.to_static(vlist_of_dict)(x)) def test4(self): + import numpy as np + def vlist_of_dict(x): a = np.array([1, 2, 3]) for i in range(3): @@ -310,6 +312,8 @@ def vlist_of_dict(x): print(paddle.jit.to_static(vlist_of_dict)(x)) def test5(self): + import numpy as np + def vlist_of_dict(x): a = np.array([1, 2, 3]) for i in range(3): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 6ed5758893f11..28078aba7893c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -296,7 +296,9 @@ def forward( blocks = self.block(inputs) for i, block in enumerate(blocks): if i > 0: - block = fluid.layers.concat(input=[route, block], axis=1) + block = fluid.layers.concat( + input=[route, block], axis=1 # noqa: F821 + ) route, tip = self.yolo_blocks[i](block) block_out = self.block_outputs[i](tip) self.outputs.append(block_out) diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py 
index 6e20037185285..9b08f17dadd7b 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -59,7 +59,7 @@ def _get_item(t, i, np_dtype): raise ValueError("Not supported data type " + str(np_dtype)) -def _set_item(t, i, e, np_dtype): +def _set_item(t, i, e, np_dtype, place): if np_dtype == np.float16: np_t = np.array(t).astype(np.float16) shape = np_t.shape @@ -145,14 +145,14 @@ def run(): for i in range(x_size): orig = _get_item(x_t, i, np_type) x_pos = orig + delta - _set_item(x_t, i, x_pos, np_type) + _set_item(x_t, i, x_pos, np_type, place) y_pos = run() x_neg = orig - delta - _set_item(x_t, i, x_neg, np_type) + _set_item(x_t, i, x_neg, np_type, place) y_neg = run() - _set_item(x_t, i, orig, np_type) + _set_item(x_t, i, orig, np_type, place) for j in range(len(y)): jacobian[j][i, :] = (y_pos[j] - y_neg[j]) / delta / 2.0 @@ -207,7 +207,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope): filted_idx, filted_dx = zip(*filted) for i in range(y_size): - _set_item(dy_t, i, 1, np_type) + _set_item(dy_t, i, 1, np_type, place) dx_res = exe.run(program, scope=scope, fetch_list=filted_dx) @@ -220,7 +220,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope): dx[dx_idx].shape, dtype=np_type ).flatten() - _set_item(dy_t, i, 0, np_type) + _set_item(dy_t, i, 0, np_type, place) return jacobian diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index a150683a415f3..5c55ced4f292c 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -60,8 +60,12 @@ def __init__( def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) - auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, [None, None] # noqa: F821 + ) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, [None, None] # noqa: F821 + ) else: auto.shard_tensor( self.linear0.weight, _global_process_mesh, [None, None] @@ -93,8 +97,8 @@ def mlp_forward(train_program, start_program): ) if _global_parallel_strategy == "pp": - auto.shard_tensor(input, PP_MESH_0, [None, None]) - auto.shard_tensor(label, PP_MESH_1, [None, None]) + auto.shard_tensor(input, PP_MESH_0, [None, None]) # noqa: F821 + auto.shard_tensor(label, PP_MESH_1, [None, None]) # noqa: F821 elif _global_parallel_strategy == "dp": auto.shard_tensor(input, _global_process_mesh, ["x", None]) else: diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py index dbf14c81948f9..14d9676b339d4 100644 --- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py @@ -86,7 +86,9 @@ def set_data(self): lod_level_i = np.random.randint( low=1, high=5, - size=self.num_seq if i == 0 else sum(lod_level_i), + size=self.num_seq + if i == 0 + else sum(lod_level_i), # noqa: F821 ).tolist() data_lod.append(lod_level_i) data_value = np.random.random( diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py index 9d77dadf8dc09..0481096d5760b 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py 
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py @@ -83,7 +83,9 @@ def set_data(self): lod_level_i = np.random.randint( low=1, high=5, - size=self.batch_size if i == 0 else sum(lod_level_i), + size=self.batch_size + if i == 0 + else sum(lod_level_i), # noqa: F821 ).tolist() data_lod.append(lod_level_i) data_value = np.random.random( From 8a339d24c7aa15fb071a02ab85f3438e99af4b69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 15 Nov 2022 04:59:41 +0100 Subject: [PATCH 012/210] mkldnn directory cleanup (#47779) * cleanup unused code * unify is_int8 is_bfloat16 * Simplify matmul_v2 FWD kernel * remove RunKernel methods * remove import namespace * remove headers * clean fluid/phi cross imports * remove fluid axpy_handler * delete fluid methods * activations * OneDNNMemDesc * MKLDNNFormatForSize * MatchShapeToLayout * MKLDNNMemoryFormat * MKLDNNFormat * ReorderMKLDNNHandler * to_void_cast * review suggestions * interpolate * remove fluid depedency --- .../fluid/framework/data_layout_transform.cc | 21 +- .../fluid/framework/data_layout_transform.h | 48 +- .../framework/data_layout_transform_test.cc | 7 +- paddle/fluid/framework/data_transform.cc | 14 +- .../conv_activation_mkldnn_fuse_pass.cc | 6 +- .../ir/mkldnn/elt_act_mkldnn_fuse_pass.cc | 4 +- .../ir/mkldnn/fc_act_mkldnn_fuse_pass.cc | 4 +- .../ir/mkldnn/interpolate_mkldnn_pass.cc | 4 +- .../ir/mkldnn/interpolate_mkldnn_pass.h | 4 +- .../matmul_activation_mkldnn_fuse_pass.cc | 4 +- .../softplus_activation_mkldnn_fuse_pass.cc | 4 +- .../new_executor/interpreter/data_transfer.cc | 6 +- .../new_executor/new_executor_defs.cc | 6 +- paddle/fluid/framework/op_kernel_type.h | 4 +- paddle/fluid/framework/operator.cc | 30 +- paddle/fluid/framework/phi_utils_test.cc | 2 +- paddle/fluid/framework/tensor_util.cc | 6 +- paddle/fluid/imperative/infer_shape_context.h | 2 +- paddle/fluid/imperative/prepared_operator.cc | 4 +- .../inference/api/details/zero_copy_tensor.cc | 6 +- paddle/fluid/operators/CMakeLists.txt | 3 - paddle/fluid/operators/activation_op.cc | 1 - paddle/fluid/operators/batch_norm_op.cc | 8 +- .../fluid/operators/controlflow/fetch_op.cc | 2 +- .../operators/controlflow/fetch_v2_op.cc | 2 +- paddle/fluid/operators/conv_op.cc | 8 +- paddle/fluid/operators/conv_transpose_op.cc | 4 +- paddle/fluid/operators/dequantize_op.cc | 2 +- .../operators/elementwise/elementwise_op.h | 4 +- .../mkldnn/elementwise_mkldnn_op.h | 121 ++- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 42 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 73 +- .../fused/mkldnn/fusion_rnn_mkldnn.h | 21 +- .../fused/mkldnn/multi_gru_mkldnn_op.cc | 71 +- paddle/fluid/operators/fused/multi_gru_op.cc | 2 +- paddle/fluid/operators/interpolate_op.cc | 4 +- paddle/fluid/operators/interpolate_v2_op.cc | 4 +- paddle/fluid/operators/lrn_op.cc | 8 +- paddle/fluid/operators/matmul_op.cc | 4 +- paddle/fluid/operators/matmul_v2_op.cc | 4 +- paddle/fluid/operators/mkldnn/CMakeLists.txt | 4 - paddle/fluid/operators/mkldnn/axpy_handler.cc | 142 ---- paddle/fluid/operators/mkldnn/axpy_handler.h | 59 -- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 19 +- .../mkldnn/conv_transpose_mkldnn_op.cc | 69 +- .../operators/mkldnn/dequantize_mkldnn_op.cc | 17 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 35 +- .../operators/mkldnn/interpolate_mkldnn_op.cc | 23 +- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 13 +- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 26 +- .../fluid/operators/mkldnn/matmul_mkldnn_op.h | 48 -- 
.../operators/mkldnn/matmul_v2_mkldnn_op.cc | 696 ++++++++---------- .../operators/mkldnn/mkldnn_activation_op.h | 57 -- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 43 +- .../operators/mkldnn/quantize_mkldnn_op.cc | 25 +- .../operators/mkldnn/requantize_mkldnn_op.cc | 11 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 26 +- .../mkldnn/shuffle_channel_mkldnn_op.cc | 7 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 24 +- paddle/fluid/operators/pad2d_op.cc | 6 +- paddle/fluid/operators/pad3d_op.cc | 6 +- paddle/fluid/operators/pool_op.cc | 8 +- paddle/fluid/operators/quantize_op.cc | 2 +- paddle/fluid/operators/requantize_op.cc | 2 +- paddle/fluid/operators/slice_op.cc | 4 +- paddle/fluid/operators/split_op.cc | 2 +- paddle/fluid/operators/squeeze_op.cc | 8 +- paddle/fluid/operators/transfer_layout_op.cc | 2 +- paddle/fluid/operators/transfer_layout_op.h | 17 +- paddle/fluid/platform/mkldnn_helper.h | 266 +------ paddle/fluid/platform/mkldnn_reuse.h | 198 +---- paddle/phi/backends/onednn/onednn_helper.h | 27 - paddle/phi/backends/onednn/onednn_reuse.h | 47 +- paddle/phi/core/dense_tensor.inl | 24 +- paddle/phi/kernels/funcs/CMakeLists.txt | 10 +- .../kernels/funcs/selected_rows_functor.cc | 16 +- paddle/phi/kernels/onednn/conv_handler.h | 4 +- paddle/phi/kernels/onednn/conv_kernel.cc | 3 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- 79 files changed, 861 insertions(+), 1711 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/CMakeLists.txt delete mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.cc delete mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.h delete mode 100644 paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h delete mode 100644 paddle/fluid/operators/mkldnn/mkldnn_activation_op.h diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 1594272fc5b5e..e31c9055320da 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -101,15 +101,16 @@ void* GetDataFromTensor(const phi::DenseTensor& tensor, dnnl::memory::data_type type) { switch (type) { case dnnl::memory::data_type::f32: - return platform::to_void_cast(tensor.data()); + return phi::funcs::to_void_cast(tensor.data()); case dnnl::memory::data_type::s8: - return platform::to_void_cast(tensor.data()); + return phi::funcs::to_void_cast(tensor.data()); case dnnl::memory::data_type::u8: - return platform::to_void_cast(tensor.data()); + return phi::funcs::to_void_cast(tensor.data()); case dnnl::memory::data_type::s32: - return platform::to_void_cast(tensor.data()); + return phi::funcs::to_void_cast(tensor.data()); case dnnl::memory::data_type::bf16: - return platform::to_void_cast(tensor.data()); + return phi::funcs::to_void_cast( + tensor.data()); default: PADDLE_THROW( platform::errors::InvalidArgument("Wrong mkldnn type provided.")); @@ -125,7 +126,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, auto place = expected_kernel_type.place_; PADDLE_ENFORCE( - in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + in_layout == DataLayout::ONEDNN && out_layout != DataLayout::ONEDNN, platform::errors::InvalidArgument( "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " "non-MKLDNN")); @@ -165,7 +166,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataTypeToString(framework::TransToProtoVarType(in.dtype())))); auto out_format = - platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + 
phi::funcs::OneDNNFormatForSize(in_tz.size(), ToOneDNNFormat(out_layout)); dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format); // output tensor has the same dims as input. Reorder don't change dims @@ -177,8 +178,8 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { void* in_data = GetDataFromTensor(in, in_type); - platform::ReorderMKLDNNHandler handler( - in_tz, framework::TransToProtoVarType(in.dtype()), in_type, cpu_engine); + phi::funcs::ReorderOneDNNHandler handler( + in_tz, in.dtype(), in_type, cpu_engine); auto reorder_src_memory_p = handler.AcquireSrcMemory(in.mem_desc(), in_data); @@ -199,7 +200,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, } // For exepected NHWC data format we need to reshape the Output tensor // As MKL-DNN description was in NCHW and paddle is expecting NHWC - platform::MatchShapeToLayout(out, in_layout, out_layout); + phi::funcs::MatchShapeToLayout(out, in_layout, out_layout); out->set_layout(DataLayout::kNCHW); } diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index a32ab69d10076..f3bfdc17497f0 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -52,51 +52,35 @@ struct CastDataLayout { }; #ifdef PADDLE_WITH_MKLDNN -using MKLDNNDataType = dnnl::memory::data_type; +using OneDNNDataType = dnnl::memory::data_type; -inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) { +inline OneDNNMemoryFormat ToOneDNNFormat(const DataLayout& layout) { switch (layout) { case DataLayout::kNHWC: - return MKLDNNMemoryFormat::nhwc; + return OneDNNMemoryFormat::nhwc; case DataLayout::kNCHW: - return MKLDNNMemoryFormat::nchw; + return OneDNNMemoryFormat::nchw; case DataLayout::kNCDHW: - return MKLDNNMemoryFormat::ncdhw; + return OneDNNMemoryFormat::ncdhw; case DataLayout::kNDHWC: - return MKLDNNMemoryFormat::ndhwc; + return OneDNNMemoryFormat::ndhwc; default: PADDLE_THROW(platform::errors::InvalidArgument( - "Fail to convert layout %s to MKLDNN format.", + "Fail to convert layout %s to oneDNN format.", phi::DataLayoutToString(layout))); } } -inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) { - switch (format) { - case MKLDNNMemoryFormat::nhwc: - return DataLayout::kNHWC; - case MKLDNNMemoryFormat::nchw: - return DataLayout::kNCHW; - case MKLDNNMemoryFormat::ncdhw: - return DataLayout::kNCDHW; - case MKLDNNMemoryFormat::ndhwc: - return DataLayout::kNDHWC; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Fail to convert MKLDNN format to paddle layout.")); - } -} - -inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { - static std::unordered_map dict{ - {DataTypeTrait::DataType(), MKLDNNDataType::f32}, - {DataTypeTrait::DataType(), MKLDNNDataType::s8}, - {DataTypeTrait::DataType(), MKLDNNDataType::u8}, - {DataTypeTrait::DataType(), MKLDNNDataType::s32}, - {DataTypeTrait::DataType(), MKLDNNDataType::bf16}}; +inline OneDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType(), OneDNNDataType::f32}, + {DataTypeTrait::DataType(), OneDNNDataType::s8}, + {DataTypeTrait::DataType(), OneDNNDataType::u8}, + {DataTypeTrait::DataType(), OneDNNDataType::s32}, + {DataTypeTrait::DataType(), OneDNNDataType::bf16}}; auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; - return MKLDNNDataType::undef; + return 
OneDNNDataType::undef; } void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, @@ -111,7 +95,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const phi::DenseTensor& in, phi::DenseTensor* out); -void* GetDataFromTensor(const phi::DenseTensor& tensor, MKLDNNDataType type); +void* GetDataFromTensor(const phi::DenseTensor& tensor, OneDNNDataType type); #endif diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index 55f6215598485..68fee94d61775 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -54,9 +54,8 @@ TEST(DataTransformBf16, GetDataFromTensorDNNL) { void* in_data = paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16); - EXPECT_EQ( - in_data, - paddle::platform::to_void_cast(in.data())); + EXPECT_EQ(in_data, + phi::funcs::to_void_cast(in.data())); } TEST(DataTransformInt32, GetDataFromTensorDNNL) { @@ -66,6 +65,6 @@ TEST(DataTransformInt32, GetDataFromTensorDNNL) { void* in_data = paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::s32); - EXPECT_EQ(in_data, paddle::platform::to_void_cast(in.data())); + EXPECT_EQ(in_data, phi::funcs::to_void_cast(in.data())); } #endif diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 8b226fc2c2932..b9247571f1923 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -49,24 +49,24 @@ void TransformData(const OpKernelType &expected_kernel_type, // do layout transform if (NeedTransformLayout(lout, lin)) { #ifdef PADDLE_WITH_MKLDNN - if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + if (lin == DataLayout::ONEDNN || lout == DataLayout::ONEDNN) { PADDLE_ENFORCE_EQ( - !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + !(lin == DataLayout::ONEDNN && lout == DataLayout::ONEDNN), true, platform::errors::PreconditionNotMet( - "No layout transform needed between two MKLDNN OPKernels.")); + "No layout transform needed between two oneDNN OPKernels.")); - if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { + if (lin != DataLayout::ONEDNN && lout == DataLayout::ONEDNN) { // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), - ToMKLDNNFormat(lin)); + auto out_format = phi::funcs::OneDNNFormatForSize(in.dims().size(), + ToOneDNNFormat(lin)); out.ShareDataWith(input_tensor); // For NHWC data we need reshape of tensors as MKL-DNN // is expecting NHWC dims description order if (lin == DataLayout::kNHWC || lin == DataLayout::kNDHWC) { - platform::MatchShapeToLayout(&out, lin, lout); + phi::funcs::MatchShapeToLayout(&out, lin, lout); // We register only NHWC assuming that model is consistent e.g. 
either // NHWC or NCHW paddle::platform::MKLDNNDeviceContext::tls() diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 8723cab36c621..3cd3cc8f7b054 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -25,7 +25,7 @@ namespace ir { using string::PrettyLogDetail; void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { - auto act_types = paddle::platform::GetSupportedActivations(); + auto act_types = phi::funcs::GetSupportedActivations(); std::vector conv_types = {"conv2d"}; for (auto& act_type : act_types) { @@ -64,7 +64,7 @@ void ConvActivationMkldnnFusePass::FuseConvAct(Graph* graph, OpDesc* conv_op = conv->Op(); OpDesc* act_op = activation->Op(); - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto& attrs : attr_map) { if (act_op->HasAttr(attrs.first)) { conv_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); @@ -145,7 +145,7 @@ void ConvActivationMkldnnFusePass::FuseConvConcatAct( OpDesc* conv_op = node->inputs[0]->Op(); OpDesc* act_op = activation_op->Op(); - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto& attrs : attr_map) { if (act_op->HasAttr(attrs.first)) { conv_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index 55ff71ba2eb56..87b2e6406137d 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -27,7 +27,7 @@ namespace ir { using string::PrettyLogDetail; void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { - auto act_types = paddle::platform::GetSupportedActivations(); + auto act_types = phi::funcs::GetSupportedActivations(); std::vector elt_types = { "elementwise_add", "elementwise_sub", "elementwise_mul"}; @@ -76,7 +76,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } auto *activation_op = activation->Op(); - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { elementwise_op->SetAttr(attr.second, diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index f4ac65a9ab199..298e9cf49caea 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -25,7 +25,7 @@ namespace ir { using string::PrettyLogDetail; void FuseFCActOneDNNPass::ApplyImpl(Graph *graph) const { - auto act_types = paddle::platform::GetSupportedActivations(); + auto act_types = phi::funcs::GetSupportedActivations(); for (auto act_type : act_types) FuseFCAct(graph, act_type); } @@ -61,7 +61,7 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, "is used.")); } - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (act_op->HasAttr(attr.first)) { fc_op->SetAttr(attr.second, act_op->GetAttr(attr.first)); diff --git 
a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 53212e3dd6913..17a9da84100da 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -31,7 +31,7 @@ namespace ir { class Graph; -void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { +void InterpolateOneDNNPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( "Pointer to graph argument should not be NULL.")); @@ -70,4 +70,4 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(interpolate_mkldnn_pass, - paddle::framework::ir::InterpolateMKLDNNPass); + paddle::framework::ir::InterpolateOneDNNPass); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h index c18ed16fe595a..59d1c81cf7ea5 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.h @@ -28,9 +28,9 @@ namespace ir { */ class Graph; -class InterpolateMKLDNNPass : public FusePassBase { +class InterpolateOneDNNPass : public FusePassBase { public: - virtual ~InterpolateMKLDNNPass() {} + virtual ~InterpolateOneDNNPass() {} protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc index 3f1478e3fe5fd..7a40a145bdb4f 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -25,7 +25,7 @@ namespace ir { using string::PrettyLogDetail; void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { - auto act_types = paddle::platform::GetSupportedActivations(); + auto act_types = phi::funcs::GetSupportedActivations(); auto matmul_types = {"matmul", "matmul_v2"}; for (const auto& matmul_type : matmul_types) @@ -64,7 +64,7 @@ void MatmulActivationMkldnnFusePass::FuseMatmulAct( OpDesc* matmul_op = matmul->Op(); OpDesc* act_op = activation->Op(); - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto& attrs : attr_map) { if (act_op->HasAttr(attrs.first)) { matmul_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc index 3620a305dee87..38c86d225f970 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc @@ -27,7 +27,7 @@ namespace ir { using string::PrettyLogDetail; void SoftplusActivationOneDNNPass::ApplyImpl(Graph *graph) const { - auto act_types = paddle::platform::GetSupportedActivations(); + auto act_types = phi::funcs::GetSupportedActivations(); // Currently softplus can't be fused with hard_sigmoid act_types.erase( @@ -75,7 +75,7 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation( } auto *activation_op = activation->Op(); - auto attr_map = paddle::platform::GetAttributeMap(act_type); + auto attr_map = phi::funcs::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { 
softplus_op->SetAttr(attr.second, activation_op->GetAttr(attr.first)); diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 4ab7cf1c49479..0416ed16d7a2c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -230,7 +230,7 @@ std::shared_ptr TransferLayout(const std::string& var_name, #ifdef PADDLE_WITH_MKLDNN // NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in fetch_op.cc - if (in_layout == phi::DataLayout::kMKLDNN && + if (in_layout == phi::DataLayout::ONEDNN && var_name == framework::GradVarName("Filter") && is_fetch_v2) { VLOG(4) << "Match special case(Filter && fetch_v2) " << var_name; out_layout = phi::DataLayout::kNCHW; @@ -484,9 +484,9 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, // MKL-DNN shape of Var may differ from kNHWC Var // In such situation corressponding resized Var // has to be created and registered - if ((tensor_in->layout() == DataLayout::kMKLDNN) && + if ((tensor_in->layout() == DataLayout::ONEDNN) && (var->IsType() == true) && - (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && + (expected_kernel_key.data_layout_ != DataLayout::ONEDNN) && (paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == DataLayout::kNHWC)) { VLOG(7) << "Created reshaped dummy input based on MKL-DNN " diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index a2b7cb9879026..38dfd391a01e0 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -244,7 +244,7 @@ void InterpretercoreInferShapeContext::ShareAllLoD( auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); #ifdef PADDLE_WITH_MKLDNN - if (in_tensor.layout() != DataLayout::kMKLDNN) + if (in_tensor.layout() != DataLayout::ONEDNN) #endif out_tensor->set_layout(in_tensor.layout()); } @@ -309,7 +309,7 @@ void InterpretercoreInferShapeContext::ShareLoD(const std::string& in, // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN // OPKernel. 
In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called // in Compute() - if (in_tensor.layout() != DataLayout::kMKLDNN) + if (in_tensor.layout() != DataLayout::ONEDNN) #endif out_tensor->set_layout(in_tensor.layout()); } @@ -338,7 +338,7 @@ bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { auto& op_with_kernel = dynamic_cast(op_); return ((op_with_kernel.kernel_type()) && (op_with_kernel.kernel_type()->data_layout_ == - phi::DataLayout::kMKLDNN)); + phi::DataLayout::ONEDNN)); } catch (std::bad_cast& exp) { return false; } diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index f226328d2a93b..78b38eed080b7 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -102,8 +102,8 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); #ifdef PADDLE_WITH_MKLDNN // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa - ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); - ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); + ret |= (l != DataLayout::ONEDNN && r == DataLayout::ONEDNN); + ret |= (l == DataLayout::ONEDNN && r != DataLayout::ONEDNN); #endif return ret; } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c28947c75d6ca..1013cf8c49914 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -913,7 +913,7 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); #ifdef PADDLE_WITH_MKLDNN - if (in_tensor.layout() != DataLayout::kMKLDNN) + if (in_tensor.layout() != DataLayout::ONEDNN) #endif out_tensor->set_layout(in_tensor.layout()); } @@ -978,7 +978,7 @@ class RuntimeInferShapeContext : public InferShapeContext { // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called // in Compute() - if (in_tensor.layout() != DataLayout::kMKLDNN) + if (in_tensor.layout() != DataLayout::ONEDNN) #endif out_tensor->set_layout(in_tensor.layout()); } @@ -1006,7 +1006,7 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_with_kernel = dynamic_cast(op_); return ((op_with_kernel.kernel_type()) && (op_with_kernel.kernel_type()->data_layout_ == - phi::DataLayout::kMKLDNN)); + phi::DataLayout::ONEDNN)); } catch (const std::bad_cast& exp) { return false; } @@ -1441,7 +1441,7 @@ bool OperatorWithKernel::SupportsKernelType( this->CanMKLDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kMKLDNN; - tmp_kernel_type.data_layout_ = framework::DataLayout::kMKLDNN; + tmp_kernel_type.data_layout_ = framework::DataLayout::ONEDNN; return kernels.find(tmp_kernel_type) != kernels.end(); } #endif @@ -1637,7 +1637,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, phi_kernel_name = kernel_signature_->name; // NOTE(jiahongyu): The registered MKLDNN kernel have library_type = -// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// LibraryType::kMKLDNN and data_layout_ = DataLayout::ONEDNN. But the default // values are kPlain, so we need to modify the library_type and data_layout_ // here. There are three statements in if condition: // 1. 
Whether mkldnn kernel fallbacks to plain kernel; @@ -1648,7 +1648,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, !paddle::platform::in_mkldnn_white_list(type_) && this->CanMKLDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kMKLDNN; - kernel_type_->data_layout_ = framework::DataLayout::kMKLDNN; + kernel_type_->data_layout_ = framework::DataLayout::ONEDNN; } #endif @@ -1897,7 +1897,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( if (!this->DnnFallback() && !paddle::platform::in_mkldnn_white_list(type_) && this->CanMKLDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; - expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::ONEDNN; } #endif @@ -2295,16 +2295,16 @@ Scope* OperatorWithKernel::PrepareData( // Var without buffer may be needed // for some situation like InferShape(). // In this situation We cannot skip Var analysis, as - // MKL-DNN shape of Var may differ from kNHWC Var + // oneDNN shape of Var may differ from kNHWC Var // In such situation corressponding resized Var // has to be created and registered - if ((tensor_in->layout() == DataLayout::kMKLDNN) && + if ((tensor_in->layout() == DataLayout::ONEDNN) && (var->IsType() == true) && - (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && + (expected_kernel_key.data_layout_ != DataLayout::ONEDNN) && (paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == DataLayout::kNHWC) && (tensor_in->dims().size() >= 3)) { - // Mixed execution : MKL-DNN and GPU is not supported! + // Mixed execution : oneDNN and GPU is not supported! if (!new_scope) { new_scope = &scope.NewScope(); } @@ -2312,9 +2312,9 @@ Scope* OperatorWithKernel::PrepareData( in_vars->at(i) = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout( + phi::funcs::MatchShapeToLayout( out, tensor_in->layout(), DataLayout::kNHWC); - VLOG(7) << "Created reshaped dummy input based on MKL-DNN " + VLOG(7) << "Created reshaped dummy input based on oneDNN " "phi::DenseTensor , " "but kNHWC layout" << in_name << " in Operator " << type_; @@ -2752,8 +2752,8 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( // When the op is first oneDNN op (there was some non oneDNN op // previously) // then we also need to rotate shape NHWC -> NCWH - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN) && + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN) && paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == phi::DataLayout::kNHWC) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index 77202a7abec9b..2ec99181058c7 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -70,7 +70,7 @@ TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { paddle::framework::OpKernelType op_kernel_type_mkldnn( paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, paddle::framework::LibraryType::kMKLDNN); auto kernel_key_mkldnn = paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); diff --git 
a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index bff439d13a508..867f15a3e09bd 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -57,7 +57,7 @@ void TensorCopyImpl(const TENSOR& src, // oneDNN tensors due to padding may be of bigger size // than numel()*size(type()) auto dst_ptr = - src.layout() == DataLayout::kMKLDNN + src.layout() == DataLayout::ONEDNN ? dst->mutable_data(dst_place, src.dtype(), src.memory_size()) : dst->mutable_data(dst_place, src.dtype()); #else @@ -72,7 +72,7 @@ void TensorCopyImpl(const TENSOR& src, VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; #ifdef PADDLE_WITH_MKLDNN - auto size = src.layout() == DataLayout::kMKLDNN + auto size = src.layout() == DataLayout::ONEDNN ? src.memory_size() : src.numel() * framework::DataTypeSize(src.dtype()); #else @@ -471,7 +471,7 @@ void TensorCopySync(const phi::DenseTensor& src, dst->Resize(src.dims()); dst->set_layout(src.layout()); #ifdef PADDLE_WITH_MKLDNN - if (src.layout() == DataLayout::kMKLDNN) { + if (src.layout() == DataLayout::ONEDNN) { dst->set_mem_desc(src.mem_desc()); } #endif diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 7e8c97703d073..d3f163da2a8e3 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -251,7 +251,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool IsRunMKLDNNKernel() const override { return (op_kernel_type_ && - (op_kernel_type_->data_layout_ == phi::DataLayout::kMKLDNN)); + (op_kernel_type_->data_layout_ == phi::DataLayout::ONEDNN)); } paddle::small_vector diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 5d9eff29e7180..8454f8f2e247d 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -232,7 +232,7 @@ PreparedOp PrepareImpl( std::string phi_kernel_name; // NOTE(jiahongyu): The registered MKLDNN kernel have library_type = -// LibraryType::kMKLDNN and data_layout_ = DataLayout::kMKLDNN. But the default +// LibraryType::kMKLDNN and data_layout_ = DataLayout::ONEDNN. But the default // values are kPlain, so we need to modify the library_type and data_layout_ // here. There are three statements in if condition: // 1. 
Whether mkldnn kernel fallbacks to plain kernel; @@ -242,7 +242,7 @@ PreparedOp PrepareImpl( if (!op.DnnFallback() && !paddle::platform::in_mkldnn_white_list(op.Type()) && op.CanMKLDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kMKLDNN; - expected_kernel_key.data_layout_ = framework::DataLayout::kMKLDNN; + expected_kernel_key.data_layout_ = framework::DataLayout::ONEDNN; } #endif diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index ba2a1f9ac2843..c72ef18cb0f9b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -377,7 +377,7 @@ void Tensor::CopyToCpuImpl(T *data, if (paddle::platform::is_cpu_place(t_place)) { #ifdef PADDLE_WITH_MKLDNN - if (tensor->layout() == phi::DataLayout::kMKLDNN) + if (tensor->layout() == phi::DataLayout::ONEDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() @@ -664,7 +664,7 @@ std::vector Tensor::shape() const { // mkldnn may does layout transform internally, so need to reorder before // return #ifdef PADDLE_WITH_MKLDNN - if (tensor->layout() == phi::DataLayout::kMKLDNN) { + if (tensor->layout() == phi::DataLayout::ONEDNN) { phi::DataLayout out_layout = paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout(); // Set default as NCHW in case not specified @@ -852,7 +852,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, if (paddle::platform::is_cpu_place(t_place)) { #ifdef PADDLE_WITH_MKLDNN - if (tensor->layout() == phi::DataLayout::kMKLDNN) + if (tensor->layout() == phi::DataLayout::ONEDNN) paddle::framework::innerTransDataLayoutFromMKLDNN( tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5293604160682..2823db516010e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -26,9 +26,6 @@ add_subdirectory(sequence_ops) add_subdirectory(string) add_subdirectory(jit) add_subdirectory(prim_ops) -if(WITH_MKLDNN) - add_subdirectory(mkldnn) -endif() if(WITH_DISTRIBUTE) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 10ea47e4bfd5d..53cd5c92cda3c 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" -#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/infermeta/backward.h" diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index ccf5aa6a62268..6c6591f34abce 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -211,8 +211,8 @@ framework::OpKernelType BatchNormOp::GetKernelTypeForVar( // Only input require reshaping, weights and // bias are having shape in NCHW order if ((var_name == "X") && - (expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + (expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_layout = ar.Get("data_layout"); @@ -401,8 +401,8 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( // Only input require reshaping, weights and // bias are having shape in NCHW order if (((var_name == "X") || (var_name == framework::GradVarName("Y"))) && - (expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + (expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_layout = ar.Get("data_layout"); diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index df60ce5dd68ce..a84c5cf04e274 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -29,7 +29,7 @@ static void DataCopy(const phi::DenseTensor &src_item, if (src_item.IsInitialized() && src_item.numel() > 0) { #ifdef PADDLE_WITH_MKLDNN // Conversion from MKL-DNN to Paddle - if (src_item.layout() == phi::DataLayout::kMKLDNN) { + if (src_item.layout() == phi::DataLayout::ONEDNN) { phi::DenseTensor out; // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 6ef4be16b287c..4daec3a6f9246 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -37,7 +37,7 @@ static void DeepCopy(const phi::DenseTensor &src_item, if (src_item.IsInitialized() && src_item.numel() > 0) { #ifdef PADDLE_WITH_MKLDNN // Conversion from MKL-DNN to Paddle - if (src_item.layout() == phi::DataLayout::kMKLDNN) { + if (src_item.layout() == phi::DataLayout::ONEDNN) { phi::DenseTensor out; // Convert to desired Paddle layout, apart from grads of filter // as params are not a subject to paddle's data_format diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 7158685c3ec86..50b90e56c03e0 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -220,8 +220,8 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar( // Only input require reshaping, weights and // bias are having shape in NCHW order if ((var_name == "Input") && - (expected_kernel_type.data_layout_ == 
phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + (expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); @@ -470,8 +470,8 @@ framework::OpKernelType ConvOpGrad::GetKernelTypeForVar( // bias are having shape in NCHW order if (((var_name == "Input") || (var_name == framework::GradVarName("Output"))) && - (expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + (expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index e9c4245bc4731..99ec8d0a8b11d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -48,8 +48,8 @@ framework::OpKernelType ConvTransposeOp::GetKernelTypeForVar( // Only input require reshaping, weights and // bias are having shape in NCHW order if ((var_name == "Input") && - (expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + (expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 31a4b50760fa1..c39f351fcb178 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -26,7 +26,7 @@ framework::OpKernelType DeQuantOp::GetExpectedKernelType( return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index c6b47d5d97948..c1dacdcef711e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -174,8 +174,8 @@ class ElementwiseOp : public framework::OperatorWithKernel { // When elementwise is first oneDNN op (there was some non oneDNN op // previously) // then we also need to rotate shape NHWC -> NCWH - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN) && + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN) && paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == phi::DataLayout::kNHWC) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index e30bafb5a500a..af6ef1fbdb051 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -28,6 +28,7 @@ using dnnl::memory; using dnnl::primitive; using dnnl::stream; using phi::DataLayout; +using phi::funcs::BinaryOneDNNHandler; inline std::vector 
CalculateBroadcastedDims( const phi::DenseTensor* x, const phi::DenseTensor* y) { @@ -51,11 +52,12 @@ inline std::vector CalculateBroadcastedDims( return dst_tz_ex; } -inline void AddSubNonBroadcast(platform::ReorderMKLDNNHandler* reorder_handler, - phi::DenseTensor* grad_tensor, - const std::shared_ptr& src_memory, - const std::shared_ptr& dst_memory, - const std::vector& scales) { +inline void AddSubNonBroadcast( + phi::funcs::ReorderOneDNNHandler* reorder_handler, + phi::DenseTensor* grad_tensor, + const std::shared_ptr& src_memory, + const std::shared_ptr& dst_memory, + const std::vector& scales) { dnnl::primitive_attr reorder_attr; reorder_attr.set_output_scales(0, scales); auto reorder_p = @@ -84,7 +86,7 @@ inline void BroadcastReduction(const framework::ExecutionContext& ctx, broadcast_reduction_attr.set_post_ops(po); } - platform::ReductionMKLDNNHandler reduction_handler( + phi::funcs::ReductionOneDNNHandler reduction_handler( dnnl::algorithm::reduction_sum, 0.0f, 0.0f, @@ -132,18 +134,18 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, - axis, - mkldnn_engine, - ctx.GetPlace(), - x, - y, - z, - scale_x, - scale_y, - scale_o, - true, - get_post_ops(ctx)); + BinaryOneDNNHandler handler(BINARY_OP, + axis, + mkldnn_engine, + ctx.GetPlace(), + x, + y, + z, + scale_x, + scale_y, + scale_o, + true, + get_post_ops(ctx)); // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance @@ -239,16 +241,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { int axis = ctx.Attr("axis"); auto tz = phi::vectorize(dout->dims()); - auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); - platform::ReorderMKLDNNHandler reorder_handler( - tz, - proto_type_dout, - framework::ToMKLDNNDataType(proto_type_dout), - onednn_engine); + phi::funcs::ReorderOneDNNHandler reorder_handler( + tz, dout->dtype(), dout_type, onednn_engine); auto reorder_src_memory = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), platform::to_void_cast(dout->data())); + dout->mem_desc(), phi::funcs::to_void_cast(dout->data())); std::shared_ptr dst_memory; std::shared_ptr broadcast_src_memory = reorder_src_memory; @@ -265,17 +264,17 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { &reorder_handler, dx, reorder_src_memory, dst_memory, scales); } } else { // elementwise_mul & elementwise_div - platform::BinaryMKLDNNHandler binary_handler(BINARY_OP, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - y, - dx, - 1.0f, - 1.0f, - 1.0f, - false); + BinaryOneDNNHandler binary_handler(BINARY_OP, + axis, + onednn_engine, + ctx.GetPlace(), + dout, + y, + dx, + 1.0f, + 1.0f, + 1.0f, + false); const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); @@ -323,23 +322,22 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { std::shared_ptr src_0_memory; std::shared_ptr src_1_memory; - platform::BinaryMKLDNNHandler binary_handler( - dnnl::algorithm::binary_mul, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - x, - nullptr, - 1.0f, - 1.0f, - 1.0f, - false); + BinaryOneDNNHandler binary_handler(dnnl::algorithm::binary_mul, + axis, + onednn_engine, + ctx.GetPlace(), + dout, + x, + nullptr, + 1.0f, + 1.0f, + 1.0f, + false); src_1_memory = 
binary_handler.AcquireSecondSrcMemory(x); if (BINARY_OP == dnnl::algorithm::binary_div) { - platform::BinaryMKLDNNHandler post_op_binary_handler( + BinaryOneDNNHandler post_op_binary_handler( dnnl::algorithm::binary_div, axis, onednn_engine, @@ -358,19 +356,18 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { po.append_binary(dnnl::algorithm::binary_div, post_op_memory->get_desc()); - binary_handler = - platform::BinaryMKLDNNHandler(dnnl::algorithm::binary_mul, - axis, - onednn_engine, - ctx.GetPlace(), - dout, - out, - nullptr, - -1.0f, - 1.0f, - 1.0f, - false, - po); + binary_handler = BinaryOneDNNHandler(dnnl::algorithm::binary_mul, + axis, + onednn_engine, + ctx.GetPlace(), + dout, + out, + nullptr, + -1.0f, + 1.0f, + 1.0f, + false, + po); src_1_memory = binary_handler.AcquireSecondSrcMemory(out); } diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index b49c0cafffc65..31e74372cb9b8 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -20,10 +20,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using paddle::platform::MKLDNNGetDataType; -using paddle::platform::MKLDNNMemDesc; -using phi::CPUContext; -using platform::to_void_cast; +using phi::funcs::OneDNNGetDataType; +using phi::funcs::OneDNNMemDesc; template class GRUMKLDNNHandler : public RNNMKLDNNHandler { @@ -73,7 +71,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { // Weights for int8 kernel are of a type s8 const auto weights_dt = - is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType(); + is_INT8 ? dnnl::memory::data_type::s8 : OneDNNGetDataType(); // oneDNN RNN dimensions const int64_t D = 1; // Directions @@ -81,18 +79,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { const int64_t G = 3; // Number of Gates, 3 for GRU // Create memory descriptors - auto input_md = MKLDNNMemDesc( - {Ti, N, IC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ntc); + auto input_md = OneDNNMemDesc( + {Ti, N, IC}, OneDNNGetDataType(), OneDNNMemoryFormat::ntc); auto weight_x_md = - MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); + OneDNNMemDesc({L, D, IC, G, OC}, weights_dt, OneDNNMemoryFormat::any); auto weight_h_md = - MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); - auto bias_md = MKLDNNMemDesc( - {L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); - auto hidden_md = MKLDNNMemDesc( - {Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ntc); - auto h0_md = MKLDNNMemDesc( - {L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc); + OneDNNMemDesc({L, D, OC, G, OC}, weights_dt, OneDNNMemoryFormat::any); + auto bias_md = OneDNNMemDesc( + {L, D, G, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ldgo); + auto hidden_md = OneDNNMemDesc( + {Ti, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ntc); + auto h0_md = OneDNNMemDesc( + {L, D, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ldnc); // Create GRU oneDNN primitive const auto direction = @@ -121,9 +119,9 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { std::static_pointer_cast(this->dev_ctx_.GetBlob(wx_key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, this->IC, this->G, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, 
this->engine_); auto* weight_x_data = reinterpret_cast(user_memory.get_data_handle()); @@ -161,9 +159,9 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler { std::static_pointer_cast(this->dev_ctx_.GetBlob(wh_key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, this->OC, this->G, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, this->engine_); // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to @@ -357,7 +355,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle(); auto* hidden_data = - to_void_cast(hidden->mutable_data(ctx.GetPlace())); + phi::funcs::to_void_cast(hidden->mutable_data(ctx.GetPlace())); if (handler.is_NTC()) { handler.reorderRNNdata(hidden_onednn_data, hidden_data, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index a9464e947cf0a..1ce97637358d9 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -20,10 +20,8 @@ limitations under the License. */ namespace paddle { namespace operators { -using paddle::platform::MKLDNNGetDataType; -using paddle::platform::MKLDNNMemDesc; -using phi::CPUContext; -using platform::to_void_cast; +using phi::funcs::OneDNNGetDataType; +using phi::funcs::OneDNNMemDesc; template class LSTMMKLDNNHandler @@ -80,7 +78,7 @@ class LSTMMKLDNNHandler // Weights for int8 kernel are of a type s8 const auto weights_dt = - is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType(); + is_INT8 ? 
dnnl::memory::data_type::s8 : OneDNNGetDataType(); // oneDNN RNN dimensions const int64_t D = 1; // Directions @@ -88,21 +86,21 @@ class LSTMMKLDNNHandler const int64_t G = 4; // Number of Gates, 4 for LSTM // Create memory descriptors - auto input_md = MKLDNNMemDesc( - {Ti, N, IC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::tnc); + auto input_md = OneDNNMemDesc( + {Ti, N, IC}, OneDNNGetDataType(), OneDNNMemoryFormat::tnc); auto weight_x_md = - MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); + OneDNNMemDesc({L, D, IC, G, OC}, weights_dt, OneDNNMemoryFormat::any); auto weight_h_md = - MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); - auto bias_md = MKLDNNMemDesc( - {L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); - auto hidden_md = MKLDNNMemDesc( - {Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + OneDNNMemDesc({L, D, OC, G, OC}, weights_dt, OneDNNMemoryFormat::any); + auto bias_md = OneDNNMemDesc( + {L, D, G, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ldgo); + auto hidden_md = OneDNNMemDesc( + {Ti, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::any); - auto h0_md = MKLDNNMemDesc( - {L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto c0_md = MKLDNNMemDesc( - {L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto h0_md = OneDNNMemDesc( + {L, D, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::any); + auto c0_md = OneDNNMemDesc( + {L, D, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::any); // Create LSTM oneDNN primitive const auto direction = @@ -123,9 +121,9 @@ class LSTMMKLDNNHandler dnnl::memory::desc(), dnnl::memory::desc()); } else { - auto weight_peephole_md = MKLDNNMemDesc({L, D, 3, OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldgo); + auto weight_peephole_md = OneDNNMemDesc({L, D, 3, OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldgo); this->AcquireForwardPrimitiveDescriptor( this->attr_, dnnl::prop_kind::forward_inference, @@ -173,9 +171,9 @@ class LSTMMKLDNNHandler std::static_pointer_cast(this->dev_ctx_.GetBlob(wx_key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, this->IC, this->G, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, this->engine_); auto* weight_x_data = reinterpret_cast(user_memory.get_data_handle()); @@ -205,9 +203,9 @@ class LSTMMKLDNNHandler std::static_pointer_cast(this->dev_ctx_.GetBlob(wh_key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, this->OC, this->G, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, this->engine_); auto* weight_h_data = reinterpret_cast(user_memory.get_data_handle()); @@ -264,9 +262,9 @@ class LSTMMKLDNNHandler this->dev_ctx_.GetBlob(peepholes_key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, 3, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldgo); + auto user_md = OneDNNMemDesc({1, 1, 3, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldgo); auto user_memory = dnnl::memory(user_md, this->engine_); memory_p = std::make_shared( this->fwd_pd_->weights_peephole_desc(), this->engine_); @@ -292,15 +290,16 @@ class LSTMMKLDNNHandler if (!memory_p) { auto user_c0_memory = dnnl::memory(); if (c0) { - user_c0_memory = 
dnnl::memory({{1, 1, this->N, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc}, - this->engine_, - to_void_cast(c0->data())); + user_c0_memory = + dnnl::memory({{1, 1, this->N, this->OC}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldnc}, + this->engine_, + phi::funcs::to_void_cast(c0->data())); } else { user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldnc}, this->engine_); memset(user_c0_memory.get_data_handle(), 0, @@ -451,7 +450,7 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle(); auto* hidden_data = - to_void_cast(hidden->mutable_data(ctx.GetPlace())); + phi::funcs::to_void_cast(hidden->mutable_data(ctx.GetPlace())); if (handler.is_NTC()) { handler.reorderRNNdata(hidden_onednn_data, hidden_data, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h index b04b3d19281ba..2ed30d3c16af0 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h +++ b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h @@ -20,13 +20,10 @@ namespace paddle { namespace operators { using paddle::platform::CreateKey; -using paddle::platform::MKLDNNGetDataType; -using paddle::platform::MKLDNNMemDesc; -using phi::CPUContext; -using platform::to_void_cast; +using phi::funcs::OneDNNGetDataType; template -class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { +class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT { public: RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, @@ -42,11 +39,11 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { const int64_t OC, const int64_t G, const std::string& unique_name) - : platform::MKLDNNHandlerT( + : phi::funcs::OneDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - CreateKey(dev_ctx, unique_name, MKLDNNGetDataType(), Ti)), + CreateKey(dev_ctx, unique_name, OneDNNGetDataType(), Ti)), N(N), Ti(Ti), IC(IC), @@ -55,7 +52,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory key without Ti because weights, bias and h0 memories // do not depend on Ti size but primitive and input/output memory do memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded( - dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType())); + dev_ctx, CreateKey(dev_ctx, unique_name, OneDNNGetDataType())); // Is it int8 kernel const bool is_INT8 = std::is_same::value; @@ -163,7 +160,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { } const auto& input_lod = input->lod()[0]; - auto* x_data = to_void_cast(input->data()); + auto* x_data = phi::funcs::to_void_cast(input->data()); auto* x_onednn_data = memory_p->get_data_handle(); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); @@ -210,12 +207,12 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT { auto user_h0_memory = dnnl::memory(); if (h0) { user_h0_memory = dnnl::memory( - {{1, 1, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc}, + {{1, 1, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ldnc}, this->engine_, - to_void_cast(h0->data())); + phi::funcs::to_void_cast(h0->data())); } else { user_h0_memory = dnnl::memory( - {{1, 1, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc}, + {{1, 1, N, OC}, OneDNNGetDataType(), OneDNNMemoryFormat::ldnc}, this->engine_); memset(user_h0_memory.get_data_handle(), 0, sizeof(U) * N * OC); } diff 
--git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 79ce2ea2c90a2..84ee7c0fb9b6a 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -27,11 +27,9 @@ namespace paddle { namespace operators { using paddle::platform::CreateKey; -using paddle::platform::MKLDNNGetDataType; -using paddle::platform::MKLDNNMemDesc; -using phi::CPUContext; using phi::vectorize; -using platform::to_void_cast; +using phi::funcs::OneDNNGetDataType; +using phi::funcs::OneDNNMemDesc; using Direction = dnnl::rnn_direction; namespace { @@ -115,7 +113,7 @@ class MultiGRUHandler { // Create memory key without Ti because weights, bias and h0 memories // do not depend on Ti size but primitive and input/output memory do memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded( - dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType())); + dev_ctx, CreateKey(dev_ctx, unique_name, OneDNNGetDataType())); key_ = memory_key_; key_.append("T").append(std::to_string(Ti_)); @@ -176,26 +174,26 @@ class MultiGRUHandler { const auto weights_dt = is_int8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; - auto x_md = MKLDNNMemDesc({Ti_, N_, ICs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ntc); - auto h0_md = MKLDNNMemDesc({L, D, N_, OCs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc); - auto wx_md = MKLDNNMemDesc({L, D, ICs[layer], G, OCs[layer]}, + auto x_md = OneDNNMemDesc({Ti_, N_, ICs[layer]}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ntc); + auto h0_md = OneDNNMemDesc({L, D, N_, OCs[layer]}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldnc); + auto wx_md = OneDNNMemDesc({L, D, ICs[layer], G, OCs[layer]}, weights_dt, - MKLDNNMemoryFormat::any); - auto wh_md = MKLDNNMemDesc({L, D, OCs[layer], G, OCs[layer]}, + OneDNNMemoryFormat::any); + auto wh_md = OneDNNMemDesc({L, D, OCs[layer], G, OCs[layer]}, weights_dt, - MKLDNNMemoryFormat::any); - auto b_md = MKLDNNMemDesc({L, D, G, OCs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldgo); + OneDNNMemoryFormat::any); + auto b_md = OneDNNMemDesc({L, D, G, OCs[layer]}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldgo); auto h_md = - MKLDNNMemDesc({Ti_, N_, OCs[layer]}, - (layer == layers_ - 1) ? MKLDNNGetDataType() - : MKLDNNGetDataType(), - MKLDNNMemoryFormat::ntc); + OneDNNMemDesc({Ti_, N_, OCs[layer]}, + (layer == layers_ - 1) ? OneDNNGetDataType() + : OneDNNGetDataType(), + OneDNNMemoryFormat::ntc); auto desc = std::make_shared( dnnl::prop_kind::forward_inference, @@ -226,10 +224,10 @@ class MultiGRUHandler { if (pd == nullptr) { const int axis = 2; auto in_md = - MKLDNNMemDesc({Ti_, N_, OCs[layer]}, - (layer == layers_ - 1) ? MKLDNNGetDataType() - : MKLDNNGetDataType(), - MKLDNNMemoryFormat::ntc); + OneDNNMemDesc({Ti_, N_, OCs[layer]}, + (layer == layers_ - 1) ? 
OneDNNGetDataType() + : OneDNNGetDataType(), + OneDNNMemoryFormat::ntc); std::vector src_mds{in_md, in_md}; pd = std::make_shared( @@ -251,7 +249,7 @@ class MultiGRUHandler { dev_ctx_.SetBlob(key, memory_p); } - auto* x_data = to_void_cast(x_->data()); + auto* x_data = phi::funcs::to_void_cast(x_->data()); auto* x_onednn_data = memory_p->get_data_handle(); memset(x_onednn_data, 0, sizeof(T) * N_ * Ti_ * ICs[0]); @@ -336,8 +334,8 @@ class MultiGRUHandler { if (!memory_p) { auto user_h0_memory = dnnl::memory(); user_h0_memory = dnnl::memory({{1, 1, N_, OCs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldnc}, engine_); memset( user_h0_memory.get_data_handle(), 0, sizeof(float) * N_ * OCs[layer]); @@ -360,9 +358,9 @@ class MultiGRUHandler { std::static_pointer_cast(dev_ctx_.GetBlob(key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, ICs[layer], 3, OCs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, ICs[layer], 3, OCs[layer]}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, engine_); auto* weight_x_data = @@ -400,9 +398,9 @@ class MultiGRUHandler { std::static_pointer_cast(dev_ctx_.GetBlob(key)); if (!memory_p) { - auto user_md = MKLDNNMemDesc({1, 1, OCs[layer], 3, OCs[layer]}, - MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldigo); + auto user_md = OneDNNMemDesc({1, 1, OCs[layer], 3, OCs[layer]}, + OneDNNGetDataType(), + OneDNNMemoryFormat::ldigo); auto user_memory = dnnl::memory(user_md, engine_); // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to @@ -599,7 +597,8 @@ class MultiGRUHandler { template void reorderOutput(std::shared_ptr mem, int layer) { auto* data = mem->get_data_handle(); - auto* hidden_data = to_void_cast(hidden_->mutable_data(place_)); + auto* hidden_data = + phi::funcs::to_void_cast(hidden_->mutable_data(place_)); if (isNTC(gru_pds_[{layers_ - 1, L2R}]->dst_desc())) { reorderNTCtoPP(data, hidden_data, layers_ - 1); diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index d7c64efa4afbd..9fb260aee1936 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -143,7 +143,7 @@ framework::OpKernelType MultiGRUOp::GetExpectedKernelType( return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 230092422e556..c1b2ae3ea531b 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -348,8 +348,8 @@ class InterpolateOp : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_layout"); diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index a04d66d288f55..10a072b5623f9 100644 --- 
a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -452,8 +452,8 @@ class InterpolateV2Op : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_layout"); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 285e1ae063484..ce31108aa5448 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -233,8 +233,8 @@ class LRNOp : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); @@ -357,8 +357,8 @@ class LRNOpGrad : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f9af0b60e55d5..2f01aec1f7c48 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -715,8 +715,8 @@ class MatMulOp : public framework::OperatorWithKernel { // When matmul is first oneDNN op in a chain (there was some non oneDNN op // previously) // then we also need to rotate shape NHWC -> NCWH - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN) && + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN) && paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == phi::DataLayout::kNHWC) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 454e0f347877c..1412c9fe715b5 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -152,8 +152,8 @@ class MatMulV2Op : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN // op previously) then we also need to rotate shape NHWC -> NCWH - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN) && + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() 
!= phi::DataLayout::ONEDNN) && paddle::platform::MKLDNNDeviceContext::tls() .get_cur_paddle_data_layout() == phi::DataLayout::kNHWC) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt deleted file mode 100644 index f40286ad5d8a2..0000000000000 --- a/paddle/fluid/operators/mkldnn/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -cc_library( - mkldnn_axpy_handler - SRCS axpy_handler.cc - DEPS place device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc deleted file mode 100644 index c7b6400e2efd1..0000000000000 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" - -#include -#include -#include -#include - -#include "dnnl.hpp" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -namespace plat = paddle::platform; - -namespace { - -template -class AXPYHandler { - public: - AXPYHandler(const dnnl::engine mkldnn_engine, int n, float alpha) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - auto md = dnnl::memory::desc( - {n}, plat::MKLDNNGetDataType(), dnnl::memory::format_tag::x); - src_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE); - dst_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE); - dnnl::primitive_attr reorder_attr; - dnnl::post_ops post_operations; - if (alpha != 1.f) { - std::vector scales(1, alpha); - reorder_attr.set_output_scales(0, scales); - } - post_operations.append_sum(1.0f); - - reorder_attr.set_post_ops(post_operations); - reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr); - } - - dnnl::memory &AcquireSrcMemory(const T *x) { - src_mem_.set_data_handle(plat::to_void_cast(x)); - return src_mem_; - } - - dnnl::memory &AcquireDstMemory(T *y) { - dst_mem_.set_data_handle(y); - return dst_mem_; - } - - const dnnl::reorder &AcquireReorder() { return reorder_p_; } - - private: - dnnl::memory src_mem_; - dnnl::memory dst_mem_; - dnnl::reorder reorder_p_; -}; - -template class AXPYHandler; -template class AXPYHandler; - -template -static void naive_axpy(int n, T alpha, const T *x, T *y) { - while (n-- > 0) { - *y += alpha * *x; - ++y; - ++x; - } -} - -} // namespace - -template -class OneDNNAXPYHandler::Impl { - public: - Impl(int64_t n, T alpha); - void operator()(const T *x, T *y); - - private: - std::unique_ptr> handler_; - int64_t n_; - T alpha_; -}; - -template -OneDNNAXPYHandler::Impl::Impl(int64_t n, T alpha) : n_{n}, alpha_{alpha} { - auto &pool = plat::DeviceContextPool::Instance(); - auto cpu_place = plat::CPUPlace(); - auto *dev_ctx = - dynamic_cast(pool.Get(cpu_place)); - auto 
&cpu_engine = dev_ctx->GetEngine(); - handler_ = std::make_unique>( - cpu_engine, n, static_cast(alpha)); -} - -template -void OneDNNAXPYHandler::Impl::operator()(const T *x, T *y) { - if (this->n_ < 100) { - naive_axpy(this->n_, this->alpha_, x, y); - return; - } - - auto &reorder_src_mem_p = handler_->AcquireSrcMemory(x); - auto &reorder_dst_mem_p = handler_->AcquireDstMemory(y); - auto reorder_p = handler_->AcquireReorder(); - auto &astream = plat::MKLDNNDeviceContext::tls().get_stream(); - reorder_p.execute(astream, reorder_src_mem_p, reorder_dst_mem_p); - astream.wait(); -} - -template -OneDNNAXPYHandler::OneDNNAXPYHandler(int64_t n, T alpha) - : pimpl_{new Impl{n, alpha}, [](Impl *impl) { delete impl; }} { - VLOG(4) << "[OneDNN] OneDNNAXPYHandler<" << typeid(T).name() << ">, " - << "n: " << n << ", alpha: " << alpha; -} - -template -void OneDNNAXPYHandler::operator()(const T *x, T *y) { - pimpl_->operator()(x, y); -} - -template class OneDNNAXPYHandler; -template class OneDNNAXPYHandler; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h deleted file mode 100644 index 677fe3b010c24..0000000000000 --- a/paddle/fluid/operators/mkldnn/axpy_handler.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -namespace paddle { -namespace operators { - -/// -/// @brief Helper class for AXPY execution using oneDNN library. -/// -/// @tparam T Data type. -/// -template -class OneDNNAXPYHandler { - public: - OneDNNAXPYHandler(OneDNNAXPYHandler&) = delete; - OneDNNAXPYHandler(OneDNNAXPYHandler&&) = delete; - OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&) = delete; - OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&&) = delete; - /// - /// @brief Constructor. - /// - /// @param[in] n The number of elements in tensor (assumed 1D tensor) - /// @param[in] alpha The alpha coefficient. - /// - OneDNNAXPYHandler(int64_t n, T alpha); - /// - /// @brief Executes AXPY. - /// - /// @param[in] x The pointer to input X tensor data. - /// @param[out] y The pointer to output Y tensor data. - /// - void operator()(const T* x, T* y); - - private: - OneDNNAXPYHandler() = delete; - // (arogowie-intel) Private implementation idiom to hide dependency - // on OneDNN headers. - class Impl; - // We need custom deleter, since the compiler is unable to parameterize - // an allocator's default deleter due to incomple type. 
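As an aside on the axpy_handler removal above: the deleted OneDNNAXPYHandler expressed y := alpha*x + y as a single oneDNN reorder, with alpha carried by the reorder's output scale and a sum(1.0) post-op accumulating into the existing destination; for small vectors (n below 100 in the deleted code) it fell back to a plain scalar loop, since primitive setup cost dominates there. A minimal, self-contained sketch of the same construction against the raw oneDNN 2.x API follows; the engine/stream setup and the function name are assumptions for illustration, whereas the deleted handler obtained engine and stream from Paddle's device context.

    #include <vector>
    #include "dnnl.hpp"

    // Sketch only: y := alpha * x + y via a oneDNN reorder with an output scale
    // and a sum post-op (mirrors the construction in the deleted AXPYHandler).
    void axpy_via_reorder(int64_t n, float alpha, const float *x, float *y) {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream strm(eng);

      auto md = dnnl::memory::desc(
          {n}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
      dnnl::memory src(md, eng, const_cast<float *>(x));  // reads x
      dnnl::memory dst(md, eng, y);                        // accumulates into y

      dnnl::primitive_attr attr;
      if (alpha != 1.f) attr.set_output_scales(0, {alpha});  // scale src by alpha
      dnnl::post_ops ops;
      ops.append_sum(1.0f);  // dst = 1.0 * dst + scaled src
      attr.set_post_ops(ops);

      dnnl::reorder(src, dst, attr).execute(strm, src, dst);
      strm.wait();
    }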
- std::unique_ptr pimpl_; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 4144608de4b6d..aeba1e0ae6379 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -24,13 +24,11 @@ namespace operators { using dnnl::memory; using dnnl::primitive; -using dnnl::reorder; using dnnl::stream; using paddle::platform::MKLDNNDeviceContext; -using platform::to_void_cast; template -class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< +class BatchNormMKLDNNHandler : public phi::funcs::OneDNNHandlerNoCachingT< T, dnnl::batch_normalization_forward, dnnl::batch_normalization_backward> { @@ -40,9 +38,9 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< const Tensor *in_x, const Tensor *scale, const Tensor *out_grad) - : platform::MKLDNNHandlerNoCachingT( + : phi::funcs::OneDNNHandlerNoCachingT( mkldnn_engine, ctx.GetPlace()) { auto scale_tz = phi::vectorize(scale->dims()); PADDLE_ENFORCE_EQ( @@ -98,8 +96,8 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< std::shared_ptr AcquireMeanMemory( const phi::DenseTensor *mean) { const T *mean_data = mean->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), - to_void_cast(mean_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->mean_desc(), phi::funcs::to_void_cast(mean_data)); } std::shared_ptr AcquireMeanMemory(phi::DenseTensor *mean) { @@ -112,8 +110,9 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< std::shared_ptr AcquireVarianceMemory( const phi::DenseTensor *variance) { const T *variance_data = variance->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), - to_void_cast(variance_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->variance_desc(), + phi::funcs::to_void_cast(variance_data)); } std::shared_ptr AcquireVarianceMemory( diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 21efe523b850e..63fe71bce7c35 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -23,13 +23,14 @@ namespace operators { using Tensor = phi::DenseTensor; using phi::DataLayout; +using phi::funcs::OneDNNMemDesc; inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, const int groups) { auto weights_tz = phi::vectorize(filter->dims()); int g = std::max(groups, 1); int g_dim = (g > 1) ? 
1 : 0; - platform::GetGroupConvWeightsTz(weights_tz, g); + phi::funcs::GetGroupConvWeightsTz(weights_tz, g); // gIOHW -> gOIHW || IOHW -> OIHW std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); return weights_tz; @@ -37,7 +38,8 @@ inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, template class ConvTransposeMKLDNNHandlerT - : public platform::MKLDNNHandlerNoCachingT { + : public phi::funcs::OneDNNHandlerNoCachingT { public: ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, @@ -45,7 +47,7 @@ class ConvTransposeMKLDNNHandlerT const phi::DenseTensor* filter, const phi::DenseTensor* bias, phi::DenseTensor* output) - : platform::MKLDNNHandlerNoCachingT( + : phi::funcs::OneDNNHandlerNoCachingT( mkldnn_engine, ctx.GetPlace()), is_test_(ctx.Attr("is_test")) { PADDLE_ENFORCE_EQ(is_test_, @@ -57,16 +59,16 @@ class ConvTransposeMKLDNNHandlerT PADDLE_ENFORCE_EQ( input->layout(), - DataLayout::kMKLDNN, + DataLayout::ONEDNN, platform::errors::InvalidArgument( "Got wrong layout = %d for Input tensor.", input->layout())); PADDLE_ENFORCE_EQ( filter->layout(), - DataLayout::kMKLDNN, + DataLayout::ONEDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, + DataLayout::ONEDNN, filter->layout())); PADDLE_ENFORCE_EQ( @@ -85,10 +87,10 @@ class ConvTransposeMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( bias->layout(), - DataLayout::kMKLDNN, + DataLayout::ONEDNN, platform::errors::InvalidArgument( "The bias tensor's laytout should be %d, but got %d.", - DataLayout::kMKLDNN, + DataLayout::ONEDNN, bias->layout())); PADDLE_ENFORCE_EQ( @@ -136,25 +138,24 @@ class ConvTransposeMKLDNNHandlerT const auto src_tz = phi::vectorize(input->dims()); const auto weights_tz = GetWeightsTz(filter, groups); const auto dst_tz = phi::vectorize(output->dims()); - const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); + const auto mkldnn_paddings = phi::funcs::ToOneDNNPadding(paddings); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance */ - const auto chosen_memory_format = MKLDNNMemoryFormat::any; + const auto chosen_memory_format = OneDNNMemoryFormat::any; auto data_type = dnnl::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = dnnl::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + const auto src_md = OneDNNMemDesc(src_tz, data_type, chosen_memory_format); const auto weights_md = - platform::MKLDNNMemDesc(weights_tz, data_type, chosen_memory_format); - const auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + OneDNNMemDesc(weights_tz, data_type, chosen_memory_format); + const auto dst_md = OneDNNMemDesc( + dst_tz, phi::funcs::OneDNNGetDataType(), chosen_memory_format); const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx); auto fwd_prop_kind = is_test_ ? 
dnnl::prop_kind::forward_inference @@ -162,7 +163,7 @@ class ConvTransposeMKLDNNHandlerT if (bias) { std::vector bias_tz = phi::vectorize(bias->dims()); const auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + OneDNNMemDesc(bias_tz, data_type, OneDNNMemoryFormat::x); this->AcquireForwardPrimitiveDescriptor( conv_trans_attr, fwd_prop_kind, @@ -221,10 +222,10 @@ class ConvTransposeMKLDNNHandlerT std::shared_ptr AcquireSrcMemoryWithReorder( const phi::DenseTensor* input) { const T* input_data = input->data(); - return platform::MKLDNNHandlerNoCachingT:: + return phi::funcs::OneDNNHandlerNoCachingT:: AcquireMemoryWithReorder(input->mem_desc(), this->fwd_pd_->src_desc(), - platform::to_void_cast(input_data)); + phi::funcs::to_void_cast(input_data)); } std::shared_ptr AcquireWeightsMemoryWithReorder( @@ -236,16 +237,16 @@ class ConvTransposeMKLDNNHandlerT auto weights_tz = GetWeightsTz(filter, groups); int g = std::max(groups, 1); - auto user_src_md = platform::MKLDNNMemDesc( + auto user_src_md = OneDNNMemDesc( weights_tz, - platform::MKLDNNGetDataType(), - (g == 1) ? MKLDNNMemoryFormat::iohw : MKLDNNMemoryFormat::giohw); + phi::funcs::OneDNNGetDataType(), + (g == 1) ? OneDNNMemoryFormat::iohw : OneDNNMemoryFormat::giohw); return this->template AcquireMemoryWithReorder( dev_ctx, user_src_md, this->fwd_pd_->weights_desc(), - platform::to_void_cast(filter_data), + phi::funcs::to_void_cast(filter_data), key, "@weights_mem_p", is_test_); @@ -276,7 +277,7 @@ class ConvTransposeMKLDNNHandlerT target_memory_p = std::make_shared(target_md, this->engine_); dnnl::reorder::primitive_desc reorder_pdesc; - if (platform::is_int8()) { + if (phi::funcs::is_int8()) { dnnl::primitive_attr attr; attr.set_output_scales(mask, scale_data); reorder_pdesc = dnnl::reorder::primitive_desc( @@ -334,17 +335,17 @@ class ConvTransposeMKLDNNHandlerT const std::string& key, const phi::DenseTensor* bias) { const K* bias_data = bias->data(); - auto user_bias_md = - platform::MKLDNNMemDesc(phi::vectorize(bias->dims()), - platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::x); - return this->AcquireMemoryWithReorder(dev_ctx, - user_bias_md, - this->fwd_pd_->bias_desc(), - platform::to_void_cast(bias_data), - key, - "@bias_mem_p", - is_test_); + auto user_bias_md = OneDNNMemDesc(phi::vectorize(bias->dims()), + phi::funcs::OneDNNGetDataType(), + OneDNNMemoryFormat::x); + return this->AcquireMemoryWithReorder( + dev_ctx, + user_bias_md, + this->fwd_pd_->bias_desc(), + phi::funcs::to_void_cast(bias_data), + key, + "@bias_mem_p", + is_test_); } private: diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index c4b8b267a00c4..38c5bd1029049 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -26,10 +26,8 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using platform::to_void_cast; using Tensor = phi::DenseTensor; using dnnl::stream; -using phi::DataLayout; template class DeQuantOpKernel : public framework::OpKernel { @@ -55,8 +53,8 @@ class DeQuantOpKernel : public framework::OpKernel { ctx.template device_context(); auto x_tz = phi::vectorize(x->dims()); - auto x_paddle_dtype = framework::TransToProtoVarType(x->dtype()); - auto out_paddle_dtype = framework::TransToProtoVarType(out->dtype()); + auto x_type = phi::funcs::ToOneDNNDataType(x->dtype()); + auto out_type = phi::funcs::ToOneDNNDataType(out->dtype()); 
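A short aside on the dequantize hunk that starts here and continues below: the kernel is rewired from platform::ReorderMKLDNNHandler, which was keyed by framework proto VarTypes, onto phi::funcs::ReorderOneDNNHandler keyed directly by phi::DataType through ToOneDNNDataType. The arithmetic itself remains a single reorder whose attributes carry the scale and, when with_shift is set, a DNNL_ARG_SRC zero point. As a scalar reference for what that reorder computes, assuming the usual convention that a quantized value q with zero point `shift` and multiplicative factor `scale` dequantizes to (q - shift) * scale; the function and parameter names below are illustrative, not part of the op:

    #include <cstddef>
    #include <cstdint>

    // Reference semantics only; the kernel realizes this via oneDNN reorder
    // attributes (output scales + src zero point) rather than an explicit loop.
    void dequantize_ref(const uint8_t *q, float *out, size_t n,
                        float scale, int32_t shift) {
      for (size_t i = 0; i < n; ++i) {
        out[i] = (static_cast<float>(q[i]) - static_cast<float>(shift)) * scale;
      }
    }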
dnnl::primitive_attr attrs; static constexpr int32_t mask = 0; // same shift and scale for whole tensor @@ -69,16 +67,11 @@ class DeQuantOpKernel : public framework::OpKernel { DNNL_ARG_SRC, mask, {static_cast(quantization_shift)}); } - platform::ReorderMKLDNNHandler reorder_handler( - x_tz, - x_paddle_dtype, - framework::ToMKLDNNDataType(x_paddle_dtype), - out_paddle_dtype, - framework::ToMKLDNNDataType(out_paddle_dtype), - dev_ctx.GetEngine()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + x_tz, x->dtype(), x_type, out->dtype(), out_type, dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); + x->mem_desc(), phi::funcs::to_void_cast(x->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( out, x->mem_desc(), dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 962a96b9bc978..a9d1e6e9d5810 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -29,14 +29,9 @@ using dnnl::stream; using framework::DDim; using framework::ExecutionContext; using LoDTensor = phi::DenseTensor; +using phi::funcs::OneDNNGetDataType; +using phi::funcs::to_void_cast; using platform::MKLDNNDeviceContext; -using platform::MKLDNNGetDataType; -using platform::to_void_cast; - -template -constexpr bool IsInt8() { - return std::is_same::value || std::is_same::value; -} struct InnerProductCache { dnnl::inner_product_forward inner_product_p; @@ -47,8 +42,8 @@ struct InnerProductCache { }; template class FCMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { + : public phi::funcs::OneDNNHandlerNoCachingT { public: FCMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, @@ -59,7 +54,7 @@ class FCMKLDNNHandler const int in_num_col_dims, dnnl::engine mkldnn_engine, platform::Place cpu_place) - : platform::MKLDNNHandlerNoCachingT( + : phi::funcs::OneDNNHandlerNoCachingT( mkldnn_engine, cpu_place), dev_ctx_(dev_ctx) { this->memory_key_ = ctx.InputName("W"); @@ -82,14 +77,14 @@ class FCMKLDNNHandler dnnl::memory::desc bias_md; auto src_md = dnnl::memory::desc( - {MB, IC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + {MB, IC}, OneDNNGetDataType(), dnnl::memory::format_tag::any); auto weights_md = dnnl::memory::desc( - {OC, IC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + {OC, IC}, OneDNNGetDataType(), dnnl::memory::format_tag::any); auto dst_md = dnnl::memory::desc( - {MB, OC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + {MB, OC}, OneDNNGetDataType(), dnnl::memory::format_tag::any); if (bias) { bias_md = dnnl::memory::desc({bias->numel()}, - MKLDNNGetDataType(), + OneDNNGetDataType(), dnnl::memory::format_tag::a); } @@ -110,7 +105,7 @@ class FCMKLDNNHandler std::vector output_shift_scale; float scale = 1.0f; - if (IsInt8()) { + if (phi::funcs::is_int8()) { std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); int mask = CreateMask(1, output_shift_scale.size() > 1); attributes.set_output_scales(mask, output_shift_scale); @@ -250,7 +245,7 @@ class FCMKLDNNHandler const std::vector& scale_weights) { const float* bias_data = bias->data(); - if (IsInt8() == false) { + if (phi::funcs::is_int8() == false) { // for BF16/FP32 bias is 1D and has no scales, so reorder is not needed return this->AcquireMemoryFromPrimitive(this->fwd_pd_->bias_desc(), to_void_cast(bias_data)); @@ -267,7 +262,7 
@@ class FCMKLDNNHandler attrs.set_output_scales(mask, scale_data); auto user_md = dnnl::memory::desc({bias->dims()[0]}, - MKLDNNGetDataType(), + OneDNNGetDataType(), dnnl::memory::format_tag::a); memory_p = this->AcquireMemoryWithReorderAndAttrs( @@ -292,10 +287,10 @@ class FCMKLDNNHandler auto weights_dims = this->fwd_pd_->weights_desc().dims(); auto user_md = dnnl::memory::desc(weights_dims, - MKLDNNGetDataType(), + OneDNNGetDataType(), dnnl::memory::format_tag::io); - if (IsInt8()) { + if (phi::funcs::is_int8()) { dnnl::primitive_attr attrs; int mask = CreateMask(0, scale_data.size() > 1); attrs.set_output_scales(mask, scale_data); @@ -358,7 +353,7 @@ class FCMKLDNNKernel : public framework::OpKernel { IF_CHANGE_FC_TW_TYPENAME((std::is_same::value), ([&] { if (force_fp32_output) { this->RunKernel(ctx); - } else if (IsInt8()) { + } else if (phi::funcs::is_int8()) { if (fuse_relu) { this->RunKernel(ctx); } else { diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index a868bc3b502eb..ff3cbce546874 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -25,29 +25,28 @@ using dnnl::reorder; using dnnl::resampling_forward; using dnnl::stream; using phi::DataLayout; -using platform::to_void_cast; template -class InterpolateMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { +class InterpolateOneDNNHandler + : public phi::funcs::OneDNNHandlerNoCachingT { public: - InterpolateMKLDNNHandler(const dnnl::algorithm algo, + InterpolateOneDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, platform::Place cpu_place, const phi::DenseTensor* x, phi::DenseTensor* out) - : platform::MKLDNNHandlerNoCachingT( + : phi::funcs::OneDNNHandlerNoCachingT( engine, cpu_place) { const auto dst_tz = phi::vectorize(out->dims()); const auto dst_md = memory::desc( - dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + dst_tz, phi::funcs::OneDNNGetDataType(), OneDNNMemoryFormat::any); this->AcquireForwardPrimitiveDescriptor( dnnl::prop_kind::forward_inference, algo, x->mem_desc(), dst_md); } }; template -class InterpolateMKLDNNKernel : public framework::OpKernel { +class InterpolateOneDNNKernel : public framework::OpKernel { std::vector ComputeOutputShape( const framework::ExecutionContext& ctx) const { const auto* x = ctx.Input("X"); @@ -147,7 +146,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { framework::DDim dim_out = phi::make_ddim(out_dims_vec); out->Resize(dim_out); - InterpolateMKLDNNHandler handler( + InterpolateOneDNNHandler handler( algo, mkldnn_engine, ctx.GetPlace(), x, out); auto src_memory_p = handler.AcquireSrcMemory(x); @@ -173,10 +172,10 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(nearest_interp, MKLDNN, ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel, - ops::InterpolateMKLDNNKernel, - ops::InterpolateMKLDNNKernel); + ops::InterpolateOneDNNKernel, + ops::InterpolateOneDNNKernel, + ops::InterpolateOneDNNKernel); REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel); + ops::InterpolateOneDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index d69185f4526ec..24ae86df61ba9 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -20,18 +20,19 @@ namespace paddle { 
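A brief aside on the int8 dispatch change visible in the FC kernel above (and again in the matmul_v2 kernel further below): the file-local IsInt8/IsBfloat16 helpers are dropped in favour of the shared phi::funcs::is_int8 / phi::funcs::is_bfloat16 traits. Judging from the deleted local helpers, the shared trait performs a compile-time check of the following shape; this is a sketch only, the phi implementation itself is not shown in this patch.

    #include <cstdint>
    #include <type_traits>

    // Sketch of the check the shared trait performs; dispatch sites then read
    // e.g. `if (phi::funcs::is_int8<T_in>()) { /* int8 path */ }`.
    template <typename T>
    constexpr bool is_int8() {
      return std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
    }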
namespace operators { template -class LayerNormMKLDNNHandler - : public platform:: - MKLDNNHandlerNoCachingT { +class LayerNormOneDNNHandler + : public phi::funcs:: + OneDNNHandlerNoCachingT { public: - LayerNormMKLDNNHandler(const std::vector& dims, + LayerNormOneDNNHandler(const std::vector& dims, const float& epsilon, const dnnl::normalization_flags& flags, const bool& is_test, const phi::DenseTensor* x, const dnnl::engine engine, platform::Place cpu_place) - : platform::MKLDNNHandlerNoCachingT( + : phi::funcs::OneDNNHandlerNoCachingT( engine, cpu_place) { const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference : dnnl::prop_kind::forward_training; @@ -103,7 +104,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { flags |= dnnl::normalization_flags::use_scale_shift; } - LayerNormMKLDNNHandler handler( + LayerNormOneDNNHandler handler( src_tz, epsilon, flags, is_test, x, mkldnn_engine, ctx.GetPlace()); auto src_memory = handler.AcquireSrcMemory(x); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 5ca1f3eafafb2..a163a20309a9b 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -20,17 +20,17 @@ namespace operators { using paddle::platform::MKLDNNDeviceContext; template -class LRNMKLDNNHandler - : public platform:: - MKLDNNHandlerNoCachingT { +class LRNOneDNNHandler + : public phi::funcs:: + OneDNNHandlerNoCachingT { public: - LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + LRNOneDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, platform::Place cpu_place, const phi::DenseTensor* input) - : platform:: - MKLDNNHandlerNoCachingT( + : phi::funcs:: + OneDNNHandlerNoCachingT( mkldnn_engine, cpu_place) { const int n = ctx.Attr("n"); // MKL-DNN implements LRN in a caffe way: @@ -55,14 +55,14 @@ class LRNMKLDNNHandler k); } - LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + LRNOneDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine mkldnn_engine, platform::Place cpu_place, const phi::DenseTensor* in_x, const phi::DenseTensor* out_grad, phi::DenseTensor* in_x_grad) - : platform:: - MKLDNNHandlerNoCachingT( + : phi::funcs:: + OneDNNHandlerNoCachingT( mkldnn_engine, cpu_place) { PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), @@ -107,7 +107,7 @@ class LRNMKLDNNHandler const T* workspace_data = workspace->data(); return this->AcquireMemoryFromPrimitive( this->fwd_pd_->workspace_desc(), - platform::to_void_cast(workspace_data)); + phi::funcs::to_void_cast(workspace_data)); } }; @@ -132,7 +132,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - LRNMKLDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), x); + LRNOneDNNHandler handler(ctx, mkldnn_engine, ctx.GetPlace(), x); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -140,7 +140,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto lrn_p = handler.AcquireForwardPrimitive(); auto workspace_memory = handler.AcquireWorkspaceMemory(mid); - mid->set_layout(phi::DataLayout::kMKLDNN); + mid->set_layout(phi::DataLayout::ONEDNN); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { @@ -182,7 +182,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); const auto& 
mkldnn_engine = dev_ctx.GetEngine(); - LRNMKLDNNHandler handler( + LRNOneDNNHandler handler( ctx, mkldnn_engine, ctx.GetPlace(), in_x, out_grad, in_x_grad); auto src_memory = handler.AcquireSrcMemory(in_x); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h deleted file mode 100644 index 53dd177071496..0000000000000 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using framework::ExecutionContext; -using platform::MKLDNNDeviceContext; -using Tensor = phi::DenseTensor; - -template -class MatMulGradMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const ExecutionContext& ctx) const override; - - private: - void ExecuteMatMulGrad(const ExecutionContext& ctx, - const MKLDNNDeviceContext& dev_ctx, - const dnnl::engine& engine, - phi::DenseTensor* x, - bool trans_x, - bool is_fold_init_dims_x, - phi::DenseTensor* y, - bool trans_y, - bool is_fold_init_dims_y, - phi::DenseTensor* out) const; - void RunKernel(const ExecutionContext& ctx) const; -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 52c25fb5e827f..68813fbb5482e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -11,16 +11,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/mkldnn/matmul_mkldnn_op.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace { using dnnl::memory; using paddle::framework::ExecutionContext; using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::MKLDNNDeviceContext; -using paddle::platform::MKLDNNGetDataType; -using paddle::platform::to_void_cast; using phi::vectorize; +using phi::funcs::OneDNNGetDataType; using Tensor = phi::DenseTensor; using paddle::framework::GradVarName; using phi::make_ddim; @@ -54,15 +57,11 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext &dev_ctx, memory::data_type input_type = paddle::framework::ToMKLDNNDataType( paddle::framework::TransToProtoVarType(input->dtype())); - paddle::platform::ReorderMKLDNNHandler reorder_handler( - output_dims, - paddle::framework::TransToProtoVarType(input->dtype()), - input_type, - dev_ctx.GetEngine()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + output_dims, input->dtype(), input_type, dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - memory::format_tag::abc, - paddle::platform::to_void_cast(input->data())); + memory::format_tag::abc, phi::funcs::to_void_cast(input->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( &output, memory::format_tag::bac, dev_ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, @@ -76,16 +75,6 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext &dev_ctx, return output; } -template -constexpr bool IsInt8() { - return std::is_same::value || std::is_same::value; -} - -template -constexpr bool IsBfloat16() { - return std::is_same::value; -} - // Get row matrix shape from a vector shape. If the rank of x_dim > 1, the // original x_dim is returned. static paddle::framework::DDim RowMatrixDimsFromVector( @@ -112,7 +101,7 @@ phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) { template class MatMulMKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { + : public phi::funcs::OneDNNHandlerNoCachingT { public: MatMulMKLDNNHandler(const dnnl::engine engine, paddle::platform::Place cpu_place, @@ -122,8 +111,8 @@ class MatMulMKLDNNHandler bool trans_y, Tensor *out, float scale) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x); auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y); @@ -146,9 +135,9 @@ class MatMulMKLDNNHandler !trans_y ? 
memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; memory::dims out_strides = memory::dims{M * N, N, 1}; - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, OneDNNGetDataType(), out_strides); dnnl::primitive_attr attrs; if (scale != 1.0f) attrs.set_output_scales(0, {scale}); @@ -158,8 +147,9 @@ class MatMulMKLDNNHandler std::shared_ptr AcquireWeightsMemory(const Tensor *input) { const YT *input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(input_data)); } public: @@ -350,18 +340,14 @@ bool IsOutputFused(const ExecutionContext &ctx) { template void ExecuteMatMulV2(const ExecutionContext &ctx, - const MKLDNNDeviceContext &dev_ctx, const dnnl::engine onednn_engine, - paddle::platform::Place cpu_place, const Tensor *x, const std::vector &x_dims, bool trans_x, const Tensor *y, const std::vector &y_dims, bool trans_y, - Tensor *out, - const std::vector &out_dims, - int execution_number = 0) { + Tensor *out) { std::vector x_strides_override = GetInputStrides(ctx, "X"); std::vector y_strides_override = GetInputStrides(ctx, "Y"); MatMulV2MKLDNNHandler handler(ctx, @@ -399,14 +385,13 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need // permute - if (IsOutputFused(ctx) && !IsInt8()) { + if (IsOutputFused(ctx) && !phi::funcs::is_int8()) { auto axis = ctx.Attr>("fused_transpose_Out"); auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); - out->set_mem_desc( - permuted_md.reshape(phi::vectorize(out->dims()))); + out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); } else { out->set_mem_desc( - dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); } } @@ -423,20 +408,75 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { "head_number=1. But received `head_number` is %d", ctx.Attr("head_number"))); } - constexpr bool is_int8 = IsInt8(); - constexpr bool is_bfloat16 = IsBfloat16(); + constexpr bool is_int8 = phi::funcs::is_int8(); + constexpr bool is_bfloat16 = phi::funcs::is_bfloat16(); const bool force_fp32_output = ctx.HasAttr("force_fp32_output") ? ctx.Attr("force_fp32_output") : false; constexpr bool fuse_relu = false; // TODO(intel): Enable eltwise fuses + + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") + : ctx.Attr("transpose_X"); + bool trans_y = ctx.HasAttr("trans_y") ? 
ctx.Attr("trans_y") + : ctx.Attr("transpose_Y"); + + auto x_dims = vectorize(GetDimForInput(ctx, "X")); + auto y_dims = vectorize(GetDimForInput(ctx, "Y")); + + int ndims = std::max(x_dims.size(), y_dims.size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + CalculateMatrixDims(ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, out); + if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - RunKernel(ctx); + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); } else if (is_bfloat16) { - RunKernel(ctx); + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); } else if (fuse_relu) { - RunKernel(ctx); + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); } else { - RunKernel(ctx); + ExecuteMatMulV2(ctx, + onednn_engine, + x, + x_bd_dims, + trans_x, + y, + y_bd_dims, + trans_y, + out); } } @@ -446,7 +486,6 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { const std::vector &y_dims, std::vector *x_bd_dims, std::vector *y_bd_dims, - std::vector *out_dims, Tensor *out) const { if (x_dims.size() == 1) { (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; @@ -470,6 +509,7 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { } if (!IsOutputFused(ctx) && x_dims.size() > 2 && y_dims.size() > 2) { + auto out_dims = vectorize(out->dims()); for (size_t i = 0; i < (*x_bd_dims).size() - 2; ++i) { PADDLE_ENFORCE_EQ( (*x_bd_dims)[i] == (*y_bd_dims)[i] || (*x_bd_dims)[i] == 1 || @@ -483,126 +523,194 @@ class MatMulV2MKLDNNKernel : public paddle::framework::OpKernel { (*x_bd_dims)[i], i, (*y_bd_dims)[i])); - (*out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); + (out_dims)[i] = std::max((*x_bd_dims)[i], (*y_bd_dims)[i]); } - out->Resize(phi::make_ddim((*out_dims))); + out->Resize(phi::make_ddim((out_dims))); } } +}; - template - void RunKernel(const ExecutionContext &ctx) const { - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *out = ctx.Output("Out"); - bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") - : ctx.Attr("transpose_X"); - bool trans_y = ctx.HasAttr("trans_y") ? ctx.Attr("trans_y") - : ctx.Attr("transpose_Y"); - - auto x_dims = vectorize(GetDimForInput(ctx, "X")); - auto y_dims = vectorize(GetDimForInput(ctx, "Y")); - auto out_dims = vectorize(out->dims()); +template +class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), + 1, + paddle::platform::errors::Unimplemented( + "oneDNN matmul doesn't support multiple heads. Expected " + "head_number=1. 
But received `head_number` is %d", + ctx.Attr("head_number"))); + } - int ndims = std::max(x_dims.size(), y_dims.size()); - ndims = std::max(ndims, 3); + const auto &dev_ctx = + ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); - std::vector x_bd_dims(ndims, 1); - std::vector y_bd_dims(ndims, 1); + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = + *ctx.Input(paddle::framework::GradVarName("Out")); + auto *dx = + ctx.Output(paddle::framework::GradVarName("X")); + auto *dy = + ctx.Output(paddle::framework::GradVarName("Y")); + + bool transpose_x = ctx.HasAttr("transpose_X") + ? ctx.Attr("transpose_X") + : ctx.Attr("trans_x"); + bool transpose_y = ctx.HasAttr("transpose_Y") + ? ctx.Attr("transpose_Y") + : ctx.Attr("trans_y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + paddle::framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } - CalculateMatrixDims( - ctx, x_dims, y_dims, &x_bd_dims, &y_bd_dims, &out_dims, out); + paddle::framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } - ExecuteMatMulV2(ctx, + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, - ctx.GetPlace(), - x, - x_bd_dims, - trans_x, - y, - y_bd_dims, - trans_y, - out, - out_dims); - } -}; - -template -class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + &y, + false, + false, + &dout, + true, + false, + dx); + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &x, + false, + false, + &dout, + false, + true, + dy); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + false, + true, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); + } else { + this->ExecuteMatMulGrad(ctx, + dev_ctx, + onednn_engine, + &dout, + false, + false, + &y, + true, + false, + dx); + this->ExecuteMatMulGrad( + ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); + } - private: - void CalculateGradMatrixDims(const ExecutionContext &ctx, - Tensor *dx_tmp, - Tensor *dy_tmp, - const std::vector &dx_dims, - const std::vector &dy_dims, - std::vector *dx_bd_dims, - std::vector *dy_bd_dims) const { - for (size_t i = 0; i < dx_dims.size() - 2; ++i) { - if (dx_dims[i] != dy_dims[i]) { - if (dx_dims[i] == 1) { - (*dx_bd_dims)[i] = dy_dims[i]; - } else { - (*dy_bd_dims)[i] = dx_dims[i]; - } + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + dx->set_mem_desc(x.mem_desc()); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + dy->set_mem_desc(y.mem_desc()); } } - - dx_tmp->Resize(phi::make_ddim((*dx_bd_dims))); - dx_tmp->mutable_data(ctx.GetPlace()); - dy_tmp->Resize(phi::make_ddim((*dy_bd_dims))); - dy_tmp->mutable_data(ctx.GetPlace()); } - void ReduceSumForMatmulGradOutput( - const ExecutionContext &ctx, - const MKLDNNDeviceContext &dev_ctx, - const dnnl::engine onednn_engine, - const Tensor *dx_tmp, - Tensor *dx, - const std::vector &dx_dims, - const std::vector &squeezed_dims) const { - 
paddle::platform::ReductionMKLDNNHandler handler( - dnnl::algorithm::reduction_sum, - 0.0f, - 0.0f, - onednn_engine, - ctx.GetPlace(), - dx_tmp, - dx, - dx_dims); + private: + void ExecuteMatMulGrad(const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine &engine, + phi::DenseTensor *x, + bool trans_x, + bool is_fold_init_dims_x, + phi::DenseTensor *y, + bool trans_y, + bool is_fold_init_dims_y, + phi::DenseTensor *out) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + Tensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } - auto src_memory_p = handler.AcquireSrcMemory(dx_tmp); - auto dst_memory_p = handler.AcquireDstMemory(dx); + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; - std::unordered_map reduction_args = { - {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + MatMulMKLDNNHandler handler(engine, + ctx.GetPlace(), + &x_combined, + trans_x, + &y_combined, + trans_y, + out, + alpha); - auto &astream = MKLDNNDeviceContext::tls().get_stream(); - auto reduction_p = handler.AcquireForwardPrimitive(); + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); - reduction_p->execute(astream, reduction_args); - astream.wait(); + auto matmul_p = handler.AcquireForwardPrimitive(); - dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); - } + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; - std::vector ExtendDimsWithOnes(const std::vector &dims, - int new_size) const { - std::vector new_dims(new_size, 1); - for (size_t i = 0; i < dims.size(); ++i) { - new_dims[new_size - dims.size() + i] = dims[i]; - } + auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); - return new_dims; + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); } +}; - void RunKernel(const ExecutionContext &ctx) const { +template +class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { const auto &dev_ctx = ctx.template device_context(); const auto &onednn_engine = dev_ctx.GetEngine(); @@ -660,113 +768,39 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims); if (trans_x && trans_y) { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - y, - y_dims, - true, - dout, - dout_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - true, - x, - x_dims, - true, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2( + ctx, onednn_engine, y, y_dims, true, dout, dout_dims, true, &dx_tmp); + ExecuteMatMulV2( + ctx, onednn_engine, dout, dout_dims, true, x, x_dims, true, &dy_tmp); } else if (trans_x) { + ExecuteMatMulV2( + ctx, onednn_engine, y, y_dims, false, dout, dout_dims, true, &dx_tmp); 
ExecuteMatMulV2(ctx, - dev_ctx, onednn_engine, - ctx.GetPlace(), - y, - y_dims, - false, - dout, - dout_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), x, x_dims, false, dout, dout_dims, false, - &dy_tmp, - dy_bd_dims, - 2); + &dy_tmp); } else if (trans_y) { ExecuteMatMulV2(ctx, - dev_ctx, onednn_engine, - ctx.GetPlace(), dout, dout_dims, false, y, y_dims, false, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - true, - x, - x_dims, - false, - &dy_tmp, - dy_bd_dims, - 2); + &dx_tmp); + ExecuteMatMulV2( + ctx, onednn_engine, dout, dout_dims, true, x, x_dims, false, &dy_tmp); } else { - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - dout, - dout_dims, - false, - y, - y_dims, - true, - &dx_tmp, - dx_bd_dims, - 1); - ExecuteMatMulV2(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - x, - x_dims, - true, - dout, - dout_dims, - false, - &dy_tmp, - dy_bd_dims, - 2); + ExecuteMatMulV2( + ctx, onednn_engine, dout, dout_dims, false, y, y_dims, true, &dx_tmp); + ExecuteMatMulV2( + ctx, onednn_engine, x, x_dims, true, dout, dout_dims, false, &dy_tmp); } if (x_dims != dx_bd_dims) { @@ -776,7 +810,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { &dx_tmp, dx, x_dims, - phi::vectorize(x->dims())); + vectorize(x->dims())); } else { *dx = std::move(dx_tmp); } @@ -787,7 +821,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { &dy_tmp, dy, y_dims, - phi::vectorize(y->dims())); + vectorize(y->dims())); } else { *dy = std::move(dy_tmp); } @@ -797,162 +831,76 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { } private: - paddle::operators::MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; -}; -} // anonymous namespace - -namespace paddle { -namespace operators { - -template -void MatMulGradMKLDNNKernel::Compute(const ExecutionContext &ctx) const { - if (ctx.HasAttr("head_number")) { - PADDLE_ENFORCE_EQ( - ctx.Attr("head_number"), - 1, - platform::errors::Unimplemented( - "oneDNN matmul doesn't support multiple heads. Expected " - "head_number=1. But received `head_number` is %d", - ctx.Attr("head_number"))); - } - RunKernel(ctx); -} + void CalculateGradMatrixDims(const ExecutionContext &ctx, + Tensor *dx_tmp, + Tensor *dy_tmp, + const std::vector &dx_dims, + const std::vector &dy_dims, + std::vector *dx_bd_dims, + std::vector *dy_bd_dims) const { + for (size_t i = 0; i < dx_dims.size() - 2; ++i) { + if (dx_dims[i] != dy_dims[i]) { + if (dx_dims[i] == 1) { + (*dx_bd_dims)[i] = dy_dims[i]; + } else { + (*dy_bd_dims)[i] = dx_dims[i]; + } + } + } -template -void MatMulGradMKLDNNKernel::ExecuteMatMulGrad( - const ExecutionContext &ctx, - const MKLDNNDeviceContext &dev_ctx, - const dnnl::engine &engine, - Tensor *x, - bool trans_x, - bool is_fold_init_dims_x, - Tensor *y, - bool trans_y, - bool is_fold_init_dims_y, - Tensor *out) const { - // gradient is calculated in a different way when broadcasting is used - bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && - out->dims().size() == 2; - - Tensor x_combined, y_combined; - if (!need_combine) { - x_combined = *x; - y_combined = *y; - } else { - x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) - : FoldFirstAndLastDims(dev_ctx, x); - y_combined = is_fold_init_dims_y ? 
FoldOuterDims(*y) - : FoldFirstAndLastDims(dev_ctx, y); + dx_tmp->Resize(phi::make_ddim((*dx_bd_dims))); + dx_tmp->mutable_data(ctx.GetPlace()); + dy_tmp->Resize(phi::make_ddim((*dy_bd_dims))); + dy_tmp->mutable_data(ctx.GetPlace()); } - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; - - MatMulMKLDNNHandler handler(engine, - ctx.GetPlace(), - &x_combined, - trans_x, - &y_combined, - trans_y, - out, - alpha); - - const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); - const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); - const auto dst_memory_p = handler.AcquireDstMemory(out); + void ReduceSumForMatmulGradOutput( + const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine onednn_engine, + const Tensor *dx_tmp, + Tensor *dx, + const std::vector &dx_dims, + const std::vector &squeezed_dims) const { + phi::funcs::ReductionOneDNNHandler handler( + dnnl::algorithm::reduction_sum, + 0.0f, + 0.0f, + onednn_engine, + ctx.GetPlace(), + dx_tmp, + dx, + dx_dims); - auto matmul_p = handler.AcquireForwardPrimitive(); + auto src_memory_p = handler.AcquireSrcMemory(dx_tmp); + auto dst_memory_p = handler.AcquireDstMemory(dx); - std::unordered_map matmul_args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; + std::unordered_map reduction_args = { + {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; - auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); - matmul_p->execute(astream, matmul_args); - astream.wait(); + auto &astream = MKLDNNDeviceContext::tls().get_stream(); + auto reduction_p = handler.AcquireForwardPrimitive(); - out->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(out->dims()))); -} + reduction_p->execute(astream, reduction_args); + astream.wait(); -template -void MatMulGradMKLDNNKernel::RunKernel(const ExecutionContext &ctx) const { - const auto &dev_ctx = - ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); - - bool transpose_x = ctx.HasAttr("transpose_X") ? ctx.Attr("transpose_X") - : ctx.Attr("trans_x"); - bool transpose_y = ctx.HasAttr("transpose_Y") ? 
ctx.Attr("transpose_Y") - : ctx.Attr("trans_y"); - - ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); } - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); + std::vector ExtendDimsWithOnes(const std::vector &dims, + int new_size) const { + std::vector new_dims(new_size, 1); + for (size_t i = 0; i < dims.size(); ++i) { + new_dims[new_size - dims.size() + i] = dims[i]; } - } - if (transpose_x && transpose_y) { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &y, true, true, &dout, true, false, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, true, false, dy); - } else if (transpose_x) { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &y, false, false, &dout, true, false, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &x, false, false, &dout, false, true, dy); - } else if (transpose_y) { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, false, false, &y, false, true, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, true, true, &x, false, true, dy); - } else { - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &dout, false, false, &y, true, false, dx); - this->ExecuteMatMulGrad( - ctx, dev_ctx, onednn_engine, &x, true, true, &dout, false, true, dy); - } - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - dx->set_mem_desc(x.mem_desc()); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - dy->set_mem_desc(y.mem_desc()); - } + return new_dims; } -} - -template class MatMulGradMKLDNNKernel; -template class MatMulGradMKLDNNKernel; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; + private: + MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; +}; +} // anonymous namespace REGISTER_OP_KERNEL(matmul, MKLDNN, @@ -965,8 +913,8 @@ REGISTER_OP_KERNEL(matmul, REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::MatMulGradMKLDNNKernel, - ops::MatMulGradMKLDNNKernel); + MatMulGradMKLDNNKernel, + MatMulGradMKLDNNKernel); REGISTER_OP_KERNEL(matmul_v2, MKLDNN, diff --git a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h deleted file mode 100644 index b2457374a73ac..0000000000000 --- a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MKLDNNActivationKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - OP_INOUT_CHECK(context.HasInput("X"), "Input", "X", "Activation"); - OP_INOUT_CHECK(context.HasInput("Out"), "Output", "Out", "Activation"); - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - -template -class MKLDNNActivationGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 2622dfb4eb204..f667c9809df04 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -30,7 +30,6 @@ using LoDTensor = phi::DenseTensor; using platform::MatMulV2MKLDNNHandler; using platform::MKLDNNDeviceContext; -using platform::to_void_cast; using dnnl::inner_product_forward; using dnnl::memory; @@ -73,7 +72,7 @@ class MulPrimitiveFactory { return *(mul_); } - auto src_desc = CreateMemDescriptor(&x_matrix, MKLDNNMemoryFormat::nc); + auto src_desc = CreateMemDescriptor(&x_matrix, OneDNNMemoryFormat::nc); x_input_ = CreateMemory(src_desc, &x_matrix); if (is_int8_) { @@ -84,7 +83,7 @@ class MulPrimitiveFactory { y_input_ = TransposeInputY(&y_matrix); } - auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); + auto dst_desc = CreateMemDescriptor(output, OneDNNMemoryFormat::any); mul_ = CreateMulPrimitive(*x_input_, *y_input_, dst_desc, output, ctx); Execute(); @@ -126,8 +125,8 @@ class MulPrimitiveFactory { auto ndims = input_y.get_desc().data.ndims; auto y_dims = std::vector(dims, dims + ndims); - auto user_y_desc = CreateMemDescriptor(y_dims, MKLDNNMemoryFormat::oi); - auto y_desc = CreateMemDescriptor(y_dims, MKLDNNMemoryFormat::oi); + auto user_y_desc = CreateMemDescriptor(y_dims, OneDNNMemoryFormat::oi); + auto y_desc = CreateMemDescriptor(y_dims, OneDNNMemoryFormat::oi); return ReorderWithScale( user_y_desc, y_desc, input_y.get_data_handle(), scale_y); @@ -205,8 +204,8 @@ class MulPrimitiveFactory { auto dst_mdesc = data->dims().size() >= 4 ? (data->dims().size() == 5 - ? CreateMemDescriptor(data, MKLDNNMemoryFormat::ncdhw) - : CreateMemDescriptor(data, MKLDNNMemoryFormat::nchw)) + ? 
CreateMemDescriptor(data, OneDNNMemoryFormat::ncdhw) + : CreateMemDescriptor(data, OneDNNMemoryFormat::nchw)) : src_mdesc; if (src_mdesc != dst_mdesc) { @@ -214,8 +213,8 @@ class MulPrimitiveFactory { Reorder(src_mdesc, dst_mdesc, - to_void_cast(data->data()), - to_void_cast(x_tmp.data())); + phi::funcs::to_void_cast(data->data()), + phi::funcs::to_void_cast(x_tmp.data())); x_tmp.Resize(data->dims()); x_tmp.set_mem_desc(dst_mdesc); @@ -230,7 +229,7 @@ class MulPrimitiveFactory { void UpdateDataPointers(const ExecutionContext &ctx, Tensor *out, const Tensor *in) { - x_input_->set_data_handle(to_void_cast(in->data())); + x_input_->set_data_handle(phi::funcs::to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); out->set_mem_desc(output_->get_desc()); } @@ -238,23 +237,24 @@ class MulPrimitiveFactory { template memory::desc CreateMemDescriptor( const Tensor *tensor, - MKLDNNMemoryFormat format, - memory::data_type type = platform::MKLDNNGetDataType()) { + OneDNNMemoryFormat format, + memory::data_type type = phi::funcs::OneDNNGetDataType()) { auto dims = phi::vectorize(tensor->dims()); - return platform::MKLDNNMemDesc(dims, type, format); + return phi::funcs::OneDNNMemDesc(dims, type, format); } template memory::desc CreateMemDescriptor( const std::vector &dims, - MKLDNNMemoryFormat format, - memory::data_type type = platform::MKLDNNGetDataType()) { - return platform::MKLDNNMemDesc(dims, type, format); + OneDNNMemoryFormat format, + memory::data_type type = phi::funcs::OneDNNGetDataType()) { + return phi::funcs::OneDNNMemDesc(dims, type, format); } template memory CreateMemory(const memory::desc &desc, const Tensor *tensor) { - return memory(desc, engine_, to_void_cast(tensor->data())); + return memory( + desc, engine_, phi::funcs::to_void_cast(tensor->data())); } memory CreateDstMemory( @@ -266,7 +266,7 @@ class MulPrimitiveFactory { OT *output_data = output->mutable_data(ctx.GetPlace(), buffer_size); output->set_mem_desc(dst_desc); - return memory(dst_desc, engine_, to_void_cast(output_data)); + return memory(dst_desc, engine_, phi::funcs::to_void_cast(output_data)); } memory Reorder(const memory::desc &src_desc, @@ -296,9 +296,10 @@ class MulPrimitiveFactory { memory TransposeInputY(const Tensor *input_y) { auto dims = phi::vectorize(input_y->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); - auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); - return Reorder(src_desc, dst_desc, to_void_cast(input_y->data())); + auto src_desc = CreateMemDescriptor(dims, OneDNNMemoryFormat::io); + auto dst_desc = CreateMemDescriptor(dims, OneDNNMemoryFormat::oi); + return Reorder( + src_desc, dst_desc, phi::funcs::to_void_cast(input_y->data())); } const dnnl::engine &engine_; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index bd62094d89e64..047f7470cd6a6 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -25,7 +25,6 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using platform::to_void_cast; using Tensor = phi::DenseTensor; using dnnl::stream; using phi::DataLayout; @@ -72,28 +71,24 @@ class QuantOpKernel : public framework::OpKernel { DNNL_ARG_DST, mask, {static_cast(quantization_shift)}); } - framework::proto::VarType::Type x_paddle_dtype = - 
framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type out_paddle_dtype; + auto x_type = phi::funcs::ToOneDNNDataType(x->dtype()); + DataType out_dtype; if (bfloat16) { - out_paddle_dtype = framework::proto::VarType::BF16; + out_dtype = DataType::BFLOAT16; } else if (is_negative_input && !with_shift) { - out_paddle_dtype = framework::proto::VarType::INT8; + out_dtype = DataType::INT8; } else { - out_paddle_dtype = framework::proto::VarType::UINT8; + out_dtype = DataType::UINT8; } - platform::ReorderMKLDNNHandler reorder_handler( - x_tz, - x_paddle_dtype, - framework::ToMKLDNNDataType(x_paddle_dtype), - out_paddle_dtype, - framework::ToMKLDNNDataType(out_paddle_dtype), - dev_ctx.GetEngine()); + auto out_type = phi::funcs::ToOneDNNDataType(out_dtype); + + phi::funcs::ReorderOneDNNHandler reorder_handler( + x_tz, x->dtype(), x_type, out_dtype, out_type, dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); + x->mem_desc(), phi::funcs::to_void_cast(x->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( out, x->mem_desc(), dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index b1a323e7ab5a6..4ac14d5ff95e5 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -24,7 +24,6 @@ namespace operators { using dnnl::memory; using dnnl::reorder; -using platform::to_void_cast; using Tensor = phi::DenseTensor; namespace { @@ -88,10 +87,10 @@ class ReQuantOpKernel : public framework::OpKernel { if (reorder_p == nullptr) { auto src_dt = framework::ToMKLDNNDataType( framework::TransToProtoVarType(input->dtype())); - auto dst_dt = with_shift ? framework::MKLDNNDataType::u8 : src_dt; + auto dst_dt = with_shift ? 
framework::OneDNNDataType::u8 : src_dt; src_memory = std::make_shared( - input->mem_desc(), engine, to_void_cast(input_data)); + input->mem_desc(), engine, phi::funcs::to_void_cast(input_data)); auto xstrides = input->mem_desc().data.format_desc.blocking.strides; @@ -112,11 +111,11 @@ class ReQuantOpKernel : public framework::OpKernel { clip_to_uint8(shift_out - reorder_scale * shift_in); std::memset(output_data, reorder_shift, output->numel()); dst_memory = std::make_shared( - dst_md, engine, to_void_cast(output_data)); + dst_md, engine, phi::funcs::to_void_cast(output_data)); } else { T* output_data = output->mutable_data(ctx.GetPlace()); dst_memory = std::make_shared( - dst_md, engine, to_void_cast(output_data)); + dst_md, engine, phi::funcs::to_void_cast(output_data)); } auto reorder_pd = @@ -129,7 +128,7 @@ class ReQuantOpKernel : public framework::OpKernel { } else { src_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(to_void_cast(input_data)); + src_memory->set_data_handle(phi::funcs::to_void_cast(input_data)); dst_memory = std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 6480125c93fd7..f1b321c5ddab7 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -30,8 +30,6 @@ enum class ReshapeKernelOpName { namespace paddle { namespace operators { -using platform::to_void_cast; - static std::vector extract_shape( const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; @@ -73,16 +71,12 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { auto x_vec_dims = phi::vectorize(x_dims); - dnnl::memory::data_type x_type = - framework::ToMKLDNNDataType(framework::TransToProtoVarType(x->dtype())); - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, - framework::TransToProtoVarType(x->dtype()), - x_type, - onednn_engine); + auto x_type = phi::funcs ::ToOneDNNDataType(x->dtype()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, x->dtype(), x_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); + x->mem_desc(), phi::funcs::to_void_cast(x->data())); out->Resize(x_dims); // to match x numel, format is changed later // reorder is done into a plain tag to allow usage with blocked formats auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( @@ -347,16 +341,12 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { auto dout_vec_dims = phi::vectorize(dout->dims()); - dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, - framework::TransToProtoVarType(dout->dtype()), - dout_type, - onednn_engine); + auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + dout_vec_dims, dout->dtype(), dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), platform::to_void_cast(dout->data())); + dout->mem_desc(), phi::funcs::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( dx, this->getPlainFormatTag(dout), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, diff --git 
a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc index fd1b1927f5fbb..424aa906eb22b 100644 --- a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc @@ -17,17 +17,16 @@ limitations under the License. */ namespace paddle { namespace operators { -using platform::MKLDNNGetDataType; template class ShuffleChannelMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { + : public phi::funcs::OneDNNHandlerNoCachingT { public: ShuffleChannelMKLDNNHandler(const phi::DenseTensor* x, const int group, const dnnl::engine engine, platform::Place cpu_place) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : phi::funcs::OneDNNHandlerNoCachingT( + engine, cpu_place) { static constexpr int channel_axis = 1; this->AcquireForwardPrimitiveDescriptor( dnnl::prop_kind::forward_training, x->mem_desc(), channel_axis, group); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index d84cfe6de41d3..b3f02153f0d75 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -53,19 +53,17 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto x_vec_dims = phi::vectorize(x->dims()); - framework::proto::VarType::Type x_paddle_type = - framework::TransToProtoVarType(x->dtype()); - dnnl::memory::data_type x_type = framework::ToMKLDNNDataType(x_paddle_type); - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, x_paddle_type, x_type, dnnl_engine); + auto x_type = phi::funcs::ToOneDNNDataType(x->dtype()); + phi::funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, x->dtype(), x_type, dnnl_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); + x->mem_desc(), phi::funcs::to_void_cast(x->data())); auto dst_md = dnnl::memory::desc(x_vec_dims, x->mem_desc().data_type(), - platform::GetPlainMKLDNNFormat(x_vec_dims.size())); + phi::funcs::GetPlainOneDNNFormat(x_vec_dims.size())); // a trick is used here to fake transpose of out_md, so later it will be // "untransposed", leaving output data in plain format tag auto dst_strides = FakeTranposeStrides(dst_md, transpose_axis); @@ -148,17 +146,13 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { } auto dout_vec_dims = phi::vectorize(dout->dims()); + auto dout_type = phi::funcs::ToOneDNNDataType(dout->dtype()); - framework::proto::VarType::Type dout_paddle_type = - framework::TransToProtoVarType(dout->dtype()); - dnnl::memory::data_type dout_type = - framework::ToMKLDNNDataType(dout_paddle_type); - - platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, dout_paddle_type, dout_type, dnnl_engine); + phi::funcs::ReorderOneDNNHandler reorder_handler( + dout_vec_dims, dout->dtype(), dout_type, dnnl_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), platform::to_void_cast(dout->data())); + dout->mem_desc(), phi::funcs::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 4d8e140c013ae..6686912941a32 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -708,7 +708,7 @@ class Pad2dOp : public 
framework::OperatorWithKernel { .data.format_desc.blocking.inner_nblks == 0) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } #endif @@ -720,8 +720,8 @@ class Pad2dOp : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index f52ab294e0745..f457151b707e7 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -42,7 +42,7 @@ class Pad3dOp : public framework::OperatorWithKernel { .data.format_desc.blocking.inner_nblks == 0) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } #endif @@ -54,8 +54,8 @@ class Pad3dOp : public framework::OperatorWithKernel { const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 48bfa3576ab6c..25d2ac8ce0d7a 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -58,8 +58,8 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); @@ -92,8 +92,8 @@ framework::OpKernelType PoolOpGrad::GetKernelTypeForVar( const phi::DenseTensor& tensor, const framework::OpKernelType& expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN - if ((expected_kernel_type.data_layout_ == phi::DataLayout::kMKLDNN) && - (tensor.layout() != phi::DataLayout::kMKLDNN)) { + if ((expected_kernel_type.data_layout_ == phi::DataLayout::ONEDNN) && + (tensor.layout() != phi::DataLayout::ONEDNN)) { auto attrs = Attrs(); auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index c60721654e345..c98e15fcff9a6 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -24,7 +24,7 @@ framework::OpKernelType QuantOp::GetExpectedKernelType( return framework::OpKernelType( 
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc index b94ed47c74ccd..d0cc991e959c7 100644 --- a/paddle/fluid/operators/requantize_op.cc +++ b/paddle/fluid/operators/requantize_op.cc @@ -24,7 +24,7 @@ framework::OpKernelType ReQuantOp::GetExpectedKernelType( return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 38aefa3a4f285..07867f5070b3c 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -167,7 +167,7 @@ class SliceOp : public framework::OperatorWithKernel { .data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } #endif @@ -340,7 +340,7 @@ class SliceOpGrad : public framework::OperatorWithKernel { .data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } #endif diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 68eab9365a3f0..0c2d79a664ea8 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -124,7 +124,7 @@ class SplitOp : public framework::OperatorWithKernel { if (x_md.data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), - phi::DataLayout::kMKLDNN, + phi::DataLayout::ONEDNN, framework::LibraryType::kMKLDNN); } #endif diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 63c5c9689f84c..93a03c535fe32 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -128,7 +128,7 @@ class SqueezeOp : public framework::OperatorWithKernel { // #ifdef PADDLE_WITH_MKLDNN // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { // return framework::OpKernelType(input_data_type, ctx.GetPlace(), - // phi::DataLayout::kMKLDNN, + // phi::DataLayout::ONEDNN, // framework::LibraryType::kMKLDNN); // } // #endif @@ -155,7 +155,7 @@ class SqueezeGradOp : public framework::OperatorWithKernel { // #ifdef PADDLE_WITH_MKLDNN // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { // return framework::OpKernelType(input_data_type, ctx.GetPlace(), - // phi::DataLayout::kMKLDNN, + // phi::DataLayout::ONEDNN, // framework::LibraryType::kMKLDNN); // } // #endif @@ -222,7 +222,7 @@ class Squeeze2Op : public framework::OperatorWithKernel { // #ifdef PADDLE_WITH_MKLDNN // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { // return framework::OpKernelType(input_data_type, ctx.GetPlace(), - // phi::DataLayout::kMKLDNN, + // phi::DataLayout::ONEDNN, // framework::LibraryType::kMKLDNN); // } // #endif @@ -270,7 +270,7 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { // #ifdef PADDLE_WITH_MKLDNN // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { // return framework::OpKernelType(input_data_type, ctx.GetPlace(), - // phi::DataLayout::kMKLDNN, + // phi::DataLayout::ONEDNN, // framework::LibraryType::kMKLDNN); // } // #endif diff --git 
a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index ec2411bf10005..84f1948cd64a1 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -49,7 +49,7 @@ class TransferLayoutOp : public framework::OperatorWithKernel { auto *in_tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in); // NOTE(zhiqiu): hot fix, allow empty tensor of kMKLDNN layout to run this // op - if (in_tensor->layout() != DataLayout::kMKLDNN) { + if (in_tensor->layout() != DataLayout::ONEDNN) { PADDLE_ENFORCE_EQ(in_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 2017fd0e2103b..3d5fe77afad57 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -63,33 +63,32 @@ class TransferLayoutFunctor { auto in_layout = static_cast(src_layout_); auto *tensor_out = out_->GetMutable(); VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout(); - if (!in_tensor.IsInitialized() && in_layout == DataLayout::kMKLDNN && + if (!in_tensor.IsInitialized() && in_layout == DataLayout::ONEDNN && out_layout == DataLayout::kNHWC) { tensor_out->Resize(in_tensor.dims()); tensor_out->set_layout(out_layout); - platform::MatchShapeToLayout(tensor_out, in_layout, out_layout); + phi::funcs::MatchShapeToLayout(tensor_out, in_layout, out_layout); return; } - if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) { + if (in_layout == DataLayout::ONEDNN || out_layout == DataLayout::ONEDNN) { PADDLE_ENFORCE_NE( in_layout, out_layout, platform::errors::PreconditionNotMet( - "No layout transform needed between two MKLDNN OPKernels.")); + "No layout transform needed between two oneDNN OPKernels.")); - if (in_layout != DataLayout::kMKLDNN && - out_layout == DataLayout::kMKLDNN) { + if (in_layout != DataLayout::ONEDNN && out_layout == DataLayout::ONEDNN) { // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - auto out_format = platform::MKLDNNFormatForSize( - in_tensor.dims().size(), framework::ToMKLDNNFormat(in_layout)); + auto out_format = phi::funcs::OneDNNFormatForSize( + in_tensor.dims().size(), framework::ToOneDNNFormat(in_layout)); out_tensor.ShareDataWith(in_tensor); // For NHWC data we need reshape of tensors as MKL-DNN // is expecting NHWC dims description order if (in_layout == DataLayout::kNHWC) { VLOG(4) << "kNHWC"; - platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout); + phi::funcs::MatchShapeToLayout(&out_tensor, in_layout, out_layout); paddle::platform::MKLDNNDeviceContext::tls() .set_cur_paddle_data_layout(in_layout); } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index b57ee09a50960..eb2552434c2fe 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -25,121 +25,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/onednn/onednn_helper.h" namespace paddle { #ifdef PADDLE_WITH_MKLDNN -using MKLDNNMemoryFormat = dnnl::memory::format_tag; +using OneDNNMemoryFormat = dnnl::memory::format_tag; #endif namespace platform { -using MKLDNNStream = dnnl::stream; -using MKLDNNEngine = dnnl::engine; -using MKLDNNMemory = dnnl::memory; -using MKLDNNMemoryDescriptor = dnnl::memory::desc; -using MKLDNNPrimitive = dnnl::primitive; -using MKLDNNPrimitiveDesc = dnnl::handle; - -typedef std::unique_ptr MKLDNNStreamPtr; -typedef std::unique_ptr MKLDNNEnginePtr; -typedef std::unique_ptr MKLDNNMemoryPtr; -typedef std::unique_ptr MKLDNNPrimitivePtr; -typedef std::unique_ptr MKLDNNPrimitiveDescPtr; - -template -void* to_void_cast(const Type* t) { - return static_cast(const_cast(t)); -} - -template -void* to_void_reinterpret_cast(const Type* t) { - return reinterpret_cast(const_cast(t)); -} - template using tf_desc = typename Type::desc; template using tf_pd = typename Type::primitive_desc; -template -std::shared_ptr> MKLDNNFwdPrimitiveDesc(const Engine& e, - Args&&... args) { - auto desc = tf_desc(dnnl::prop_kind::forward, (args)...); - auto pd = new tf_pd(desc, e); - return std::shared_ptr>(pd); -} - -template -tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, - const Primitive& p, - Args&&... args) { - auto desc = tf_desc(args...); - return tf_pd(desc, e, p); -} - -inline void MatchShapeToLayout(phi::DenseTensor* tensor_in, - phi::DataLayout from, - phi::DataLayout to) { - auto print_dims = [](const std::vector& dims) { - std::ostringstream oss; - - if (!dims.empty()) { - oss << "["; - // Convert all but the last element to avoid a trailing "," - std::copy( - dims.begin(), dims.end() - 1, std::ostream_iterator(oss, ",")); - - // Now add the last element with no delimiter - oss << dims.back() << "]"; - } - - return oss.str(); - }; - - // In these data layouts, channel dimension is either on 2nd position: nChw or - // at last nhwC, so for dim==2 these layouts are the same and nothing should - // be done. Similarly for dim==1 when you have just one possible combination. 
- if (tensor_in->dims().size() < 3) { - VLOG(3) << "Keeping kMKLDNN/kNHWC/kNDHWC output_shape" - << print_dims(phi::vectorize(tensor_in->dims())); - return; - } - - switch (from) { - case phi::DataLayout::kMKLDNN: - if ((to == phi::DataLayout::kNHWC) || (to == phi::DataLayout::kNDHWC)) { - auto dims = phi::vectorize(tensor_in->dims()); - std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); - tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC/kNDHWC output_shape" - << print_dims(dims); - } - break; - case phi::DataLayout::kNHWC: - case phi::DataLayout::kNDHWC: - if (to == phi::DataLayout::kMKLDNN) { - auto dims = phi::vectorize(tensor_in->dims()); - std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); - tensor_in->Resize(phi::make_ddim(dims)); - VLOG(3) << "Rotating Shape from: kNHWC/kNDHWC to: kMKLDNN output_shape" - << print_dims(dims); - } - break; - default: - break; - } -} - -struct mkldnn_dummy_primitive { - struct primitive_desc {}; - struct desc {}; -}; - -inline dnnl::memory::desc MKLDNNMemDesc(const std::vector& dims, - dnnl::memory::data_type data_type, - MKLDNNMemoryFormat format) { - return dnnl::memory::desc({dims}, data_type, format); -} - inline void ClearMKLDNNCache(const platform::Place& place, void* ptr = nullptr) { // Clear mkl-dnn cache, @@ -161,33 +59,6 @@ inline void DontClearMKLDNNCache(const platform::Place& place) { } } -template -dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::undef; -} - -template <> -inline dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::f32; -} -template <> -inline dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::s32; -} -template <> -inline dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::s8; -} -template <> -inline dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::u8; -} - -template <> -inline dnnl::memory::data_type MKLDNNGetDataType() { - return dnnl::memory::data_type::bf16; -} - inline void Reorder(dnnl::memory src, dnnl::memory dst, const dnnl::engine& engine) { @@ -201,95 +72,6 @@ inline void Reorder(dnnl::memory src, astream.wait(); } -inline dnnl::memory::format_tag GetPlainMKLDNNFormat(int tensor_rank) { - switch (tensor_rank) { - case 1: - return dnnl::memory::format_tag::a; - case 2: - return dnnl::memory::format_tag::ab; - case 3: - return dnnl::memory::format_tag::abc; - case 4: - return dnnl::memory::format_tag::abcd; - case 5: - return dnnl::memory::format_tag::abcde; - case 6: - return dnnl::memory::format_tag::abcdef; - case 7: - return dnnl::memory::format_tag::abcdefg; - case 8: - return dnnl::memory::format_tag::abcdefgh; - case 9: - return dnnl::memory::format_tag::abcdefghi; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Paddle support tensors with rank in range <1, 9>, but received " - "tensor with rank: %d", - tensor_rank)); - } -} - -inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size, - MKLDNNMemoryFormat data_format) { - if (dims_size == 1) { - return MKLDNNMemoryFormat::x; - } else if (dims_size == 2) { - return MKLDNNMemoryFormat::nc; - } else if (dims_size == 3) { - if (data_format == MKLDNNMemoryFormat::nchw) { - return MKLDNNMemoryFormat::ncw; - } else if (data_format == MKLDNNMemoryFormat::nhwc) { - return MKLDNNMemoryFormat::nwc; - } - } else if (dims_size == 4) { - if (data_format == MKLDNNMemoryFormat::goihw) { - return MKLDNNMemoryFormat::oihw; - } - } else if (dims_size 
== 5) { - if (data_format == MKLDNNMemoryFormat::goidhw) { - return MKLDNNMemoryFormat::oidhw; - } - if (data_format == MKLDNNMemoryFormat::nchw) { - return MKLDNNMemoryFormat::ncdhw; - } else if (data_format == MKLDNNMemoryFormat::nhwc) { - return MKLDNNMemoryFormat::ndhwc; - } - } else if (dims_size == 6) { - if (data_format == MKLDNNMemoryFormat::nchw) { - return MKLDNNMemoryFormat::abcdef; - } - } - return data_format; -} - -inline MKLDNNMemoryFormat data_format_to_memory_format( - const std::string& data_format) { - switch (phi::StringToDataLayout(data_format)) { - case phi::DataLayout::kNHWC: - return MKLDNNMemoryFormat::nhwc; - case phi::DataLayout::kNCHW: - return MKLDNNMemoryFormat::nchw; - default: - return MKLDNNMemoryFormat::any; - } -} - -inline MKLDNNMemoryFormat StringToMKLDNNFormat(std::string* format) { - std::transform(format->begin(), format->end(), format->begin(), ::tolower); - - if (!format->compare("nchw")) { - return MKLDNNMemoryFormat::nchw; - } else if (!format->compare("nchw16c")) { - return MKLDNNMemoryFormat::nChw16c; - } else if (!format->compare("nchw8c")) { - return MKLDNNMemoryFormat::nChw8c; - } else if (!format->compare("nhwc")) { - return MKLDNNMemoryFormat::nhwc; - } else { - return MKLDNNMemoryFormat::any; - } -} - inline std::string ThreadIDasStr(void) { return std::to_string( std::hash()(std::this_thread::get_id())); @@ -382,41 +164,6 @@ inline std::string ExtendKeyWithThreadInfoIfNeeded( : key; } -inline std::vector> ToMkldnnPadding( - const std::vector& paddings) { - if (paddings.size() == 6) { - int padding_front = paddings[0]; - int padding_back = paddings[1]; - int padding_top = paddings[2]; - int padding_bottom = paddings[3]; - int padding_left = paddings[4]; - int padding_right = paddings[5]; - - return {{padding_front, padding_top, padding_left}, - {padding_back, padding_bottom, padding_right}}; - } else { - int padding_top = paddings[0]; - int padding_bottom = paddings[1]; - int padding_left = paddings[2]; - int padding_right = paddings[3]; - - return {{padding_top, padding_left}, {padding_bottom, padding_right}}; - } -} - -// The function adjusts the vector of weight dimensions for group convolutions -inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT - const int groups) { - if (groups > 1) { - // if (is_conv3d) [o, i, d, h, w]->[g, o/g, i, d, h, w] - // else [o, i, h, w] -> [g, o/g, i, h, w] - weights_tz.push_back(0); - std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end()); - weights_tz[0] = groups; - weights_tz[1] = weights_tz[1] / groups; - } -} - inline void RegisterModelLayout( std::vector>& ops, // NOLINT const platform::Place& place) { @@ -461,17 +208,8 @@ inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { return op->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; } -inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { - return op->GetAttrIfExists("mkldnn_data_type") == "float32"; -} - enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; -template -bool constexpr is_int8() { - return std::is_same::value || std::is_same::value; -} - } // namespace platform inline std::string FindInputNameByVarName(framework::OpDesc* op, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index b189196429bea..7a8ef9c939572 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -30,32 +30,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -using user_function = std::function(const float*)>; using memory = dnnl::memory; -template -using MKLDNNHandlerT = - phi::funcs::OneDNNHandlerT; - -template -using MKLDNNHandlerNoCachingT = phi::funcs:: - OneDNNHandlerNoCachingT; - -template -using ReductionMKLDNNHandler = phi::funcs::ReductionOneDNNHandler; - -template -using BroadcastDataMKLDNNHandler = phi::funcs::BroadcastDataOneDNNHandler; - -template -using BinaryMKLDNNHandler = phi::funcs::BinaryOneDNNHandler; - static void AppendActivation(const framework::ExecutionContext& ctx, dnnl::post_ops& post_ops, // NOLINT float activation_scale = 1.0f) { @@ -219,19 +195,9 @@ static void SetInMemDescWithLogicalLayoutFusesSupport( } } -template -constexpr bool IsInt8() { - return std::is_same::value || std::is_same::value; -} - -template -constexpr bool IsBfloat16() { - return std::is_same::value; -} - template class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { + : public phi::funcs::OneDNNHandlerNoCachingT { public: MatMulV2MKLDNNHandler(const framework::ExecutionContext& ctx, const dnnl::engine engine, @@ -243,8 +209,8 @@ class MatMulV2MKLDNNHandler bool is_output_fused, const std::vector& x_strides_override, const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { + : phi::funcs::OneDNNHandlerNoCachingT(engine, + cpu_place) { // M X K * K X N std::vector x_dims(x_org_dims); std::vector y_dims(y_org_dims); @@ -305,13 +271,16 @@ class MatMulV2MKLDNNHandler } // TODO(jczaja): Why not for int8?? - if (!IsInt8() && is_output_fused) { + if (!phi::funcs::is_int8() && is_output_fused) { out_strides = FakeTransposeStrides(out_ddims); } - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + auto x_md = + memory::desc(x_dims, phi::funcs::OneDNNGetDataType(), x_strides); + auto y_md = + memory::desc(y_dims, phi::funcs::OneDNNGetDataType(), y_strides); + auto out_md = memory::desc( + out_ddims, phi::funcs::OneDNNGetDataType(), out_strides); const dnnl::primitive_attr matmul_attrs = CreateMatmulAttrs(ctx); @@ -347,7 +316,7 @@ class MatMulV2MKLDNNHandler auto* residual_data = ctx.Input("ResidualData"); auto residual_data_tz = phi::vectorize(residual_data->dims()); auto residual_data_md = memory::desc(residual_data_tz, - MKLDNNGetDataType(), + phi::funcs::OneDNNGetDataType(), dnnl::memory::format_tag::any); post_operations.append_binary(dnnl::algorithm::binary_add, residual_data_md); @@ -389,8 +358,9 @@ class MatMulV2MKLDNNHandler std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor* input) { const YT* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + phi::funcs::to_void_cast(input_data)); } std::shared_ptr AcquireDstMemory(phi::DenseTensor* output) { @@ -406,145 +376,5 @@ class MatMulV2MKLDNNHandler } }; -static std::unordered_map GetAttributeMap( - std::string act_type) { - std::unordered_map attr_map; - if (act_type == "swish") { - attr_map.emplace("beta", "fuse_alpha"); - } else if (act_type == "relu6") { - attr_map.emplace("threshold", "fuse_alpha"); - } else if (act_type == "hard_sigmoid") { - attr_map.emplace("slope", "fuse_alpha"); - attr_map.emplace("offset", "fuse_beta"); - } else if (act_type == 
"clip") { - attr_map.emplace("min", "fuse_alpha"); - attr_map.emplace("max", "fuse_beta"); - } else { - attr_map.emplace("alpha", "fuse_alpha"); - attr_map.emplace("beta", "fuse_beta"); - } - return attr_map; -} - -static std::vector GetSupportedActivations() { - return std::vector{"abs", - "clip", - "gelu", - "hard_sigmoid", - "hard_swish", - "leaky_relu", - "mish", - "relu", - "relu6", - "sigmoid", - "sqrt", - "swish", - "tanh"}; -} - -class ReorderMKLDNNHandler { - public: - ReorderMKLDNNHandler(std::vector& dims, // NOLINT - framework::proto::VarType::Type vtype, - dnnl::memory::data_type dtype, - dnnl::engine engine) - : dims_(dims), - vtype_(vtype), - vtype_dst_(vtype), - dtype_(dtype), - dtype_dst_(dtype), - engine_(engine) {} - - ReorderMKLDNNHandler(std::vector& dims, // NOLINT - framework::proto::VarType::Type vtype, - dnnl::memory::data_type dtype, - framework::proto::VarType::Type vtype_dst, - dnnl::memory::data_type dtype_dst, - dnnl::engine engine) - : dims_(dims), - vtype_(vtype), - vtype_dst_(vtype_dst), - dtype_(dtype), - dtype_dst_(dtype_dst), - engine_(engine) {} - - std::shared_ptr AcquireSrcMemory(const dnnl::memory::desc& md, - void* ptr) { - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, - void* ptr) { - auto md = dnnl::memory::desc(dims_, dtype_, fmt); - return std::make_shared(md, engine_, ptr); - } - - std::shared_ptr AcquireSubmemory( - const std::vector& dims, - const std::vector& offset, - const std::shared_ptr& mem_p) { - auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); - auto sub_mem_p = std::make_shared( - sub_md, engine_, mem_p->get_data_handle()); - return sub_mem_p; - } - - std::shared_ptr AcquireDstMemory(phi::DenseTensor* output, - const MKLDNNMemoryFormat& fmt, - platform::Place place) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); - auto dst_data = output->mutable_data( - place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); - } - - std::shared_ptr AcquireDstMemory( - phi::DenseTensor* output, - const dnnl::memory::desc& src_md, - platform::Place place) { - if (vtype_dst_ == vtype_) { - auto dst_data = output->mutable_data( - place, framework::TransToPhiDataType(vtype_dst_), src_md.get_size()); - return std::make_shared(src_md, engine_, dst_data); - } else { - auto dst_md = src_md; - dst_md.data.data_type = static_cast(dtype_dst_); - auto dst_data = output->mutable_data( - place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); - } - } - - std::shared_ptr AcquireDstMemory( - phi::DenseTensor* output, - const std::vector& dims, - const MKLDNNMemoryFormat& fmt, - platform::Place place) { - auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); - auto dst_data = output->mutable_data( - place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); - } - - std::shared_ptr AcquireReorder( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - return std::make_shared(*(src_memory_p), *(dst_memory_p)); - } - - std::shared_ptr AcquireReorder( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p, - const dnnl::primitive_attr& attrs) { - return std::make_shared( - *(src_memory_p), *(dst_memory_p), attrs); - } - - private: - std::vector dims_; - framework::proto::VarType::Type vtype_, vtype_dst_; - dnnl::memory::data_type dtype_, 
dtype_dst_; - dnnl::engine engine_; -}; } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index 11cc9c29f501f..f50aa11227158 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -195,28 +195,6 @@ inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) { return key; } -inline std::vector> ToOnednnPadding( - const std::vector& paddings) { - if (paddings.size() == 6) { - int padding_front = paddings[0]; - int padding_back = paddings[1]; - int padding_top = paddings[2]; - int padding_bottom = paddings[3]; - int padding_left = paddings[4]; - int padding_right = paddings[5]; - - return {{padding_front, padding_top, padding_left}, - {padding_back, padding_bottom, padding_right}}; - } else { - int padding_top = paddings[0]; - int padding_bottom = paddings[1]; - int padding_left = paddings[2]; - int padding_right = paddings[3]; - - return {{padding_top, padding_left}, {padding_bottom, padding_right}}; - } -} - // The function adjusts the vector of weight dimensions for group convolutions inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT const int groups) { @@ -306,10 +284,5 @@ inline std::string ExtendKeyWithThreadInfoIfNeeded(const OneDNNContext& dev_ctx, : key; } -template -bool constexpr is_int8() { - return std::is_same::value || std::is_same::value; -} - } // namespace funcs } // namespace phi diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 4e9d9dfc0b622..7395138bfd63b 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -35,11 +35,20 @@ limitations under the License. 
*/ namespace phi { namespace funcs { -using user_function = std::function(const float*)>; using memory = dnnl::memory; using OneDNNMemoryFormat = dnnl::memory::format_tag; +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + +template +constexpr bool is_bfloat16() { + return std::is_same::value; +} + static void AppendActivation(const OneDNNContext& dev_ctx, dnnl::post_ops& post_ops, // NOLINT float activation_scale = 1.0f) { @@ -101,6 +110,42 @@ static void AppendActivation(const OneDNNContext& dev_ctx, } } +static std::unordered_map GetAttributeMap( + std::string act_type) { + std::unordered_map attr_map; + if (act_type == "swish") { + attr_map.emplace("beta", "fuse_alpha"); + } else if (act_type == "relu6") { + attr_map.emplace("threshold", "fuse_alpha"); + } else if (act_type == "hard_sigmoid") { + attr_map.emplace("slope", "fuse_alpha"); + attr_map.emplace("offset", "fuse_beta"); + } else if (act_type == "clip") { + attr_map.emplace("min", "fuse_alpha"); + attr_map.emplace("max", "fuse_beta"); + } else { + attr_map.emplace("alpha", "fuse_alpha"); + attr_map.emplace("beta", "fuse_beta"); + } + return attr_map; +} + +static std::vector GetSupportedActivations() { + return std::vector{"abs", + "clip", + "gelu", + "hard_sigmoid", + "hard_swish", + "leaky_relu", + "mish", + "relu", + "relu6", + "sigmoid", + "sqrt", + "swish", + "tanh"}; +} + template -T* mutable_data(const phi::Place& place, - size_t requested_size = 0); +T* mutable_data(const phi::Place& place, size_t requested_size = 0); template T* mutable_data(const DDim& dims, @@ -41,15 +40,14 @@ void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, size_t requested_size = 0); -void* mutable_data(const phi::Place& place, - size_t requested_size = 0); +void* mutable_data(const phi::Place& place, size_t requested_size = 0); void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, const phi::Stream& stream); /* @jim19930609: Remove dependency on protobuf after Tensor Unification. -*/ + */ paddle::experimental::DataType type() const; // memory size returns the holding memory size in byte. @@ -86,13 +84,11 @@ std::shared_ptr MoveMemoryHolder() { void ResetHolder(const std::shared_ptr& holder); void ResetHolderWithType(const std::shared_ptr& holder, - paddle::experimental::DataType type); + paddle::experimental::DataType type); void set_type(paddle::experimental::DataType type); -InplaceVersion& InplaceVersionCounter() { - return *inplace_version_counter_; -} +InplaceVersion& InplaceVersionCounter() { return *inplace_version_counter_; } /*! The internal of two tensors share the same memory block. */ DenseTensor& ShareDataWith(const DenseTensor& src); @@ -116,11 +112,11 @@ following codes there. #ifdef PADDLE_WITH_MKLDNN public: - const dnnl::memory::desc& mem_desc() const; +const dnnl::memory::desc& mem_desc() const; inline void set_mem_desc(const dnnl::memory::desc& mem_desc) { mem_desc_ = mem_desc; - meta_.layout = DataLayout::kMKLDNN; + meta_.layout = DataLayout::ONEDNN; } #endif @@ -141,8 +137,8 @@ void set_lod(const LoD& lod); LoD* mutable_lod(); /* -* Get the start offset and end offset of an element from LoD. -*/ + * Get the start offset and end offset of an element from LoD. 
+ */ std::pair lod_element(size_t level, size_t elem) const; size_t NumLevels() const; diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 331d35398fd08..9c26e33ccebab 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -40,14 +40,8 @@ else() endif() if(WITH_MKLDNN) - math_library( - selected_rows_functor - DEPS - selected_rows_utils - math_function - blas - mkldnn_axpy_handler - mixed_vector) + math_library(selected_rows_functor DEPS selected_rows_utils math_function + blas mixed_vector) else() math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mixed_vector) diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index f17ea6d951812..e1d45eef54981 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#include "paddle/phi/backends/onednn/axpy_handler.h" #endif namespace phi { @@ -371,7 +371,9 @@ add_sparse_inputs(const std::vector& inputs, auto& input_rows = input->rows(); #ifdef PADDLE_WITH_MKLDNN - paddle::operators::OneDNNAXPYHandler axpy_handler(input_width, T(1.f)); + OneDNNContext onednn_context(context.GetPlace()); + funcs::OneDNNAXPYHandler axpy_handler( + input_width, T(1.f), onednn_context.GetEngine()); for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id.at(input_rows[i]); axpy_handler(&input_data[i * input_width], @@ -869,11 +871,11 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_row_numel, input2->numel() / in1_height, - phi::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2->numel() / in1_height)); + phi::errors::InvalidArgument("The two inputs width must be equal." 
+ "But received first input width = [%d], " + "second input width = [%d]", + in1_row_numel, + input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); diff --git a/paddle/phi/kernels/onednn/conv_handler.h b/paddle/phi/kernels/onednn/conv_handler.h index 723784a845c2d..0e99113594cba 100644 --- a/paddle/phi/kernels/onednn/conv_handler.h +++ b/paddle/phi/kernels/onednn/conv_handler.h @@ -154,7 +154,7 @@ class ConvOneDNNHandlerT const auto dst_tz = phi::vectorize(output->dims()); const dnnl::memory::dims stride_dims = strides; - const auto onednn_paddings = funcs::ToOnednnPadding(paddings); + const auto onednn_paddings = funcs::ToOneDNNPadding(paddings); const dnnl::memory::dims dilations_dims = dilations; /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -326,7 +326,7 @@ class ConvOneDNNHandlerT auto diff_dst_md = funcs::OneDNNMemDesc( dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); - auto onednn_paddings = funcs::ToOnednnPadding(paddings); + auto onednn_paddings = funcs::ToOneDNNPadding(paddings); std::transform( dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; diff --git a/paddle/phi/kernels/onednn/conv_kernel.cc b/paddle/phi/kernels/onednn/conv_kernel.cc index cdd5c4d968f4c..e2faaea6b023a 100644 --- a/paddle/phi/kernels/onednn/conv_kernel.cc +++ b/paddle/phi/kernels/onednn/conv_kernel.cc @@ -291,8 +291,7 @@ void ConvKernel(const Context& dev_ctx, dev_ctx.GetPlace().GetType(), AllocationType::CPU, phi::errors::PreconditionNotMet("Operator DNNL Conv must use CPUPlace")); - bool is_INT8 = - std::is_same::value || std::is_same::value; + bool is_INT8 = funcs::is_int8(); bool is_test = dev_ctx.HasDnnAttr("is_test") ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index f7177309006dd..300fd9b5cc65e 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -138,7 +138,7 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, src_layout, dst_layout, errors::PreconditionNotMet( - "No layout transform needed between two MKLDNN OPKernels.")); + "No layout transform needed between two oneDNN OPKernels.")); } else { TransferLayoutGeneral(dev_ctx, x, dst_layout, out); } From d4d3d7ed177f75f4828b1dfe11196d4275867a23 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 15 Nov 2022 13:15:33 +0800 Subject: [PATCH 013/210] [Zero-Dim] support input 0D Tensor for xpu kernel, test=kunlun (#47849) --- .../phi/kernels/xpu/activation_grad_kernel.cc | 46 ++- paddle/phi/kernels/xpu/activation_kernel.cc | 9 +- paddle/phi/kernels/xpu/elementwise.h | 20 + .../phi/kernels/xpu/reduce_max_grad_kernel.cc | 8 + .../kernels/xpu/reduce_mean_grad_kernel.cc | 18 +- .../phi/kernels/xpu/reduce_sum_grad_kernel.cc | 8 + paddle/phi/kernels/xpu/where_kernel.cc | 21 +- .../unittests/xpu/test_activation_op_xpu.py | 28 ++ .../xpu/test_elementwise_add_op_xpu.py | 18 + .../xpu/test_elementwise_div_op_xpu.py | 16 + .../xpu/test_elementwise_mul_op_xpu.py | 24 ++ .../xpu/test_elementwise_sub_op_xpu.py | 24 ++ .../unittests/xpu/test_zero_dim_tensor_xpu.py | 341 ++++++++++++++++++ 13 files changed, 538 insertions(+), 43 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py diff --git a/paddle/phi/kernels/xpu/activation_grad_kernel.cc b/paddle/phi/kernels/xpu/activation_grad_kernel.cc index 875a91d2a7360..a30f63d176e50 100644 --- a/paddle/phi/kernels/xpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_grad_kernel.cc @@ -169,39 +169,37 @@ struct XPULogGradFunctor : public funcs::BaseActivationFunctor { const DenseTensor* dOut, DenseTensor* dX) const { const T* x_data = nullptr; - const T* y_grad = nullptr; + const T* dout_data = nullptr; if (x != nullptr) x_data = x->data(); - if (dOut != nullptr) y_grad = dOut->data(); - T* x_grad = dX->data(); - const auto x_dims = x->dims(); - auto xshape = vectorize(x_dims); - int len = x->dims()[x_dims.size() - 1]; - std::vector yshape(1, len); - - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - T* y_data = RAII_GUARD.alloc_l3_or_gm(len); - PADDLE_ENFORCE_XDNN_NOT_NULL(y_data); - T* tmp_grad = RAII_GUARD.alloc_l3_or_gm(x->numel()); - PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_grad); - int r = - xpu::constant(dev_ctx.x_context(), y_data, len, static_cast(1.0)); + if (dOut != nullptr) dout_data = dOut->data(); + + T* dx_data = dev_ctx.template Alloc(dX); + int r = xpu::constant( + dev_ctx.x_context(), dx_data, x->numel(), static_cast(1.0)); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + auto x_dims = vectorize(x->dims()); + + // use [1] to replace [], because xpu not support [] + if (x_dims.size() == 0) { + x_dims = std::vector({1}); + } + // dx.device(d) = dout * (static_cast(1) / x); r = xpu::broadcast_div(dev_ctx.x_context(), - reinterpret_cast(y_data), + reinterpret_cast(dx_data), reinterpret_cast(x_data), - reinterpret_cast(tmp_grad), - yshape, - xshape); + reinterpret_cast(dx_data), + x_dims, + x_dims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div"); r = xpu::broadcast_mul(dev_ctx.x_context(), - reinterpret_cast(y_grad), - reinterpret_cast(tmp_grad), - reinterpret_cast(x_grad), 
- xshape, - xshape); + reinterpret_cast(dx_data), + reinterpret_cast(dout_data), + reinterpret_cast(dx_data), + x_dims, + x_dims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); } }; diff --git a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc index 73aae275d6941..f730c38e8f0f2 100644 --- a/paddle/phi/kernels/xpu/activation_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_kernel.cc @@ -213,9 +213,14 @@ void PowKernel(const Context& dev_ctx, static_cast(&pow_factor), sizeof(T)); - // broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const - // std::vector& xshape, const std::vector& yshape); auto x_dims = vectorize(x.dims()); + // use [1] to replace [], because xpu not support [] + if (x_dims.size() == 0) { + x_dims = std::vector({1}); + } + + // broadcast_pow(Context* ctx, const T* x, const T* y, T* z, const + // std::vector& xshape, const std::vector& yshape); int r = xpu::broadcast_pow(xpu_context, x_data, factor_data, y_data, x_dims, {1}); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); diff --git a/paddle/phi/kernels/xpu/elementwise.h b/paddle/phi/kernels/xpu/elementwise.h index 46bac6ce29914..3af7a0340696b 100644 --- a/paddle/phi/kernels/xpu/elementwise.h +++ b/paddle/phi/kernels/xpu/elementwise.h @@ -84,6 +84,17 @@ void XPUElementwise(const XPUContext& dev_ctx, int ret = xpu::SUCCESS; + // For [2, 3] + [] --> [2, 3] + [1, 1] + // For [] + [2, 3] --> [1, 1] + [2, 3] + // For [] + [], Use [1] + [1] to replace [], because xpu not support [] + if (x_dims_vec.size() == 0) { + x_dims_vec = std::vector({1}); + } + + if (y_dims_vec.size() == 0) { + y_dims_vec = std::vector({1}); + } + ret = func(dev_ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), @@ -165,6 +176,15 @@ void XPUElementwiseGrad(const XPUContext& dev_ctx, dy_data = dev_ctx.template Alloc(dy); } + // use [1] to replace [], because xpu not support [] + if (x_dims_vec.size() == 0) { + x_dims_vec = std::vector({1}); + } + + if (y_dims_vec.size() == 0) { + y_dims_vec = std::vector({1}); + } + int ret = func(dev_ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), diff --git a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc index df4dc678392a4..1bfc5ae5f877e 100644 --- a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc @@ -75,6 +75,14 @@ void ReduceMaxGradKernel(const Context& dev_ctx, XPU_SUCCESS, errors::ResourceExhausted("XPU has no enough memory")); + // use [1] to replace [], because xpu not support [] + if (xdims.size() == 0) { + xdims = std::vector({1}); + } + if (ydims.size() == 0) { + ydims = std::vector({1}); + } + // step 1. 
brocast out and out_grad int r = xpu::broadcast(dev_ctx.x_context(), out_data, brocast1, ydims, xdims); diff --git a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc index 2d82a77a24d61..0c2fe9a9d9e64 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc @@ -38,14 +38,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx, auto reduce_dims = dims_arr.GetData(); - std::vector xdims; - for (int i = 0; i < x.dims().size(); i++) { - xdims.push_back(x.dims()[i]); - } - std::vector ydims; - for (int i = 0; i < out_grad.dims().size(); i++) { - ydims.push_back(out_grad.dims()[i]); - } + std::vector xdims = vectorize(x.dims()); + std::vector ydims = vectorize(out_grad.dims()); int reduce_numel = 1; if (reduce_all) { @@ -74,6 +68,14 @@ void ReduceMeanGradKernel(const Context& dev_ctx, dev_ctx.x_context(), x_data, x.numel(), static_cast(val)); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + // use [1] to replace [], because xpu not support [] + if (xdims.size() == 0) { + xdims = std::vector({1}); + } + if (ydims.size() == 0) { + ydims = std::vector({1}); + } + r = xpu::broadcast_mul( dev_ctx.x_context(), x_data, dy_data, x_data, xdims, ydims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); diff --git a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc index 218eb25c5c98a..b6e4d1021e47d 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc @@ -57,6 +57,14 @@ void ReduceSumGradKernel(const Context& dev_ctx, } } + // use [1] to replace [], because xpu not support [] + if (xdims.size() == 0) { + xdims = std::vector({1}); + } + if (ydims.size() == 0) { + ydims = std::vector({1}); + } + int r = xpu::broadcast( dev_ctx.x_context(), out_data, x_grad_data, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); diff --git a/paddle/phi/kernels/xpu/where_kernel.cc b/paddle/phi/kernels/xpu/where_kernel.cc index 59650a9e89649..ed32d1c631b7b 100644 --- a/paddle/phi/kernels/xpu/where_kernel.cc +++ b/paddle/phi/kernels/xpu/where_kernel.cc @@ -31,15 +31,18 @@ void WhereKernel(const Context& ctx, T* out_data = ctx.template Alloc(out); auto cond_dims = phi::vectorize(condition.dims()); - auto input_dims = phi::vectorize(x.dims()); - - int ret = xpu::select(ctx.x_context(), - cond_data, - x_data, - y_data, - out_data, - cond_dims, - input_dims); + auto x_dims = phi::vectorize(x.dims()); + + // use [1] to replace [], because xpu not support [] + if (cond_dims.size() == 0) { + cond_dims = std::vector({1}); + } + if (x_dims.size() == 0) { + x_dims = std::vector({1}); + } + + int ret = xpu::select( + ctx.x_context(), cond_data, x_data, y_data, out_data, cond_dims, x_dims); PADDLE_ENFORCE_XDNN_SUCCESS(ret, "select"); } diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 49b673133fcaf..8c4c722cbfac0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -75,6 +75,10 @@ def set_case(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + class XPUTestExp_ZeroDIm(TestActivationOPBase): + def set_shape(self): + self.shape = [] + support_types = get_xpu_op_support_types('exp') for stype in support_types: @@ -100,6 +104,10 @@ def set_case(self): def init_config(self): 
self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + class XPUTestSigmoid_ZeroDIm(XPUTestSigmoid): + def init_config(self): + self.x = np.random.uniform(-2, 2, []).astype(self.dtype) + class XPUTestSigmoid2(XPUTestSigmoid): def init_config(self): self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype) @@ -310,6 +318,10 @@ def set_case(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + class TestLogCase_ZeroDim(XPUTestLog): + def set_shape(self): + self.shape = [] + class TestLogCase1(XPUTestLog): def set_shape(self): self.shape = [1, 11, 17] @@ -351,6 +363,10 @@ def set_case(self): def init_config(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + class XPUTestSquare_ZeroDim(XPUTestSquare): + def init_config(self): + self.x = np.random.uniform(-2, 2, []).astype(self.dtype) + class XPUTestSquare2(XPUTestSquare): def init_config(self): self.x = np.random.uniform(-2, 2, [100]).astype(self.dtype) @@ -517,6 +533,10 @@ def set_case(self): def init_config(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + class XPUTestSoftPlus_ZeroDim(XPUTestSoftPlusBase): + def init_config(self): + self.x = np.random.uniform(-2, 2, []).astype(self.dtype) + class XPUTestSoftPlus2(XPUTestSoftPlusBase): def init_config(self): self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype) @@ -976,6 +996,10 @@ def set_case(self): def init_config(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + class XPUTestSwish_ZeroDim(XPUTestSwishBase): + def init_config(self): + self.x = np.random.uniform(-2, 2, []).astype(self.dtype) + class XPUTestSwish2(XPUTestSwishBase): def init_config(self): self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype) @@ -1057,6 +1081,10 @@ def set_case(self): def init_config(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + class XPUTestMish_ZeroDim(XPUTestMishBase): + def init_config(self): + self.x = np.random.uniform(-2, 2, []).astype(self.dtype) + class XPUTestMish2(XPUTestMishBase): def init_config(self): self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py index 2d04b3d7549e4..ac9a371325aee 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py @@ -101,6 +101,24 @@ def init_axis(self): def init_max_relative_error(self): self.max_relative_error = 0.006 + class TestElementwiseAddOp_ZeroDim1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(-1, 1, []).astype(self.dtype) + self.y = np.random.uniform(-1, 1, []).astype(self.dtype) + self.out = self.x + self.y + + class TestElementwiseAddOp_ZeroDim2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(-1, 1, []).astype(self.dtype) + self.y = np.random.uniform(-1, 1, [13, 17]).astype(self.dtype) + self.out = self.x + self.y + + class TestElementwiseAddOp_ZeroDim3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.uniform(-1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(-1, 1, []).astype(self.dtype) + self.out = self.x + self.y + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." 
) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py index 4144a7068e0fa..99c1820d894d8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py @@ -93,6 +93,22 @@ def test_check_grad_ingore_y(self): def init_dtype(self): pass + class TestElementwiseDivOp_ZeroDim1(ElementwiseDivOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']} + + class TestElementwiseDivOp_ZeroDim2(ElementwiseDivOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']} + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." ) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py index 1d9c8c80f5ae5..42ab74b1382f2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py @@ -103,6 +103,30 @@ def init_dtype(self): def init_axis(self): pass + class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + class TestElementwiseMulOp_ZeroDim2(ElementwiseMulOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + class TestElementwiseMulOp_ZeroDim3(ElementwiseMulOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, [13, 17]).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." 
) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py index 927855f461d34..b5a3d2f853ea6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py @@ -80,6 +80,30 @@ def test_check_grad_ingore_y(self): no_grad_set=set('Y'), ) + class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, [13, 17]).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, []).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.uniform(-1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(-1, 1, [13, 17]).astype(self.dtype), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." ) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py new file mode 100644 index 0000000000000..5868fe9cb531b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -0,0 +1,341 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import paddle.nn.functional as F +import numpy as np +import unittest + +paddle.set_device('xpu') + + +unary_api_list = [ + paddle.nn.functional.elu, + paddle.nn.functional.gelu, + paddle.nn.functional.hardsigmoid, + paddle.nn.functional.hardswish, + paddle.nn.functional.leaky_relu, + paddle.nn.functional.log_sigmoid, + paddle.nn.functional.relu, + paddle.nn.functional.relu6, + paddle.nn.functional.sigmoid, + paddle.nn.functional.softplus, + paddle.nn.functional.softshrink, + paddle.nn.functional.softsign, + paddle.nn.functional.swish, + paddle.nn.functional.tanhshrink, + paddle.nn.functional.thresholded_relu, + paddle.stanh, + paddle.nn.functional.celu, + paddle.nn.functional.mish, + paddle.nn.functional.silu, + paddle.nn.functional.tanh, + paddle.cosh, + paddle.sinh, + paddle.abs, + paddle.acos, + paddle.asin, + paddle.atan, + paddle.ceil, + paddle.cos, + paddle.exp, + paddle.floor, + paddle.log, + paddle.log1p, + paddle.reciprocal, + paddle.round, + paddle.sin, + paddle.sqrt, + paddle.square, + paddle.tanh, + paddle.acosh, + paddle.asinh, + paddle.atanh, + paddle.expm1, + paddle.log10, + paddle.log2, + paddle.tan, +] + + +# Use to test zero-dim in unary API. 
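+# A 0-D tensor is built from an empty shape, e.g. paddle.rand([]): it has
+# shape [] and a single element, and applying a unary API to it is expected
+# to keep both the output and the gradients 0-D, as the case below asserts.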
+class TestUnaryAPI(unittest.TestCase): + def test(self): + paddle.disable_static() + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + for api in unary_api_list: + x = paddle.rand([]) + x.stop_gradient = False + out = api(x) + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + paddle.enable_static() + + +reduce_api_list = [ + paddle.sum, + paddle.mean, + paddle.nansum, + paddle.nanmean, + paddle.min, + paddle.max, + paddle.amin, + paddle.amax, + paddle.prod, + paddle.logsumexp, + paddle.all, + paddle.any, +] + + +# Use to test zero-dim of reduce API +class TestReduceAPI(unittest.TestCase): + def test(self): + paddle.disable_static() + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + for api in reduce_api_list: + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, []).astype('bool') + out = api(x, None) + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + else: + x = paddle.rand([]) + x.stop_gradient = False + out = api(x, None) + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + paddle.enable_static() + + +binary_api_list = [ + {'func': paddle.add, 'cls_method': '__add__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + {'func': paddle.multiply, 'cls_method': '__mul__'}, + {'func': paddle.divide, 'cls_method': '__div__'}, + {'func': paddle.pow, 'cls_method': '__pow__'}, +] + +binary_api_list_without_grad = [ + {'func': paddle.equal, 'cls_method': '__eq__'}, + {'func': paddle.not_equal, 'cls_method': '__ne__'}, + {'func': paddle.greater_equal, 'cls_method': '__ge__'}, + {'func': paddle.greater_than, 'cls_method': '__gt__'}, + {'func': paddle.less_equal, 'cls_method': '__le__'}, + {'func': paddle.less_than, 'cls_method': '__lt__'}, + {'func': paddle.remainder, 'cls_method': '__mod__'}, + paddle.mod, + paddle.floor_mod, + paddle.logical_and, + paddle.logical_or, + paddle.logical_xor, +] + +binary_int_api_list_without_grad = [ + paddle.bitwise_and, + paddle.bitwise_or, + paddle.bitwise_xor, +] + + +# Use to test zero-dim of binary API +class TestBinaryAPI(unittest.TestCase): + def test(self): + paddle.disable_static() + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + for api in binary_api_list + binary_api_list_without_grad: + # 1) x/y is 0D + x = paddle.rand([]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + self.assertEqual(out.shape, []) + if api not in binary_api_list_without_grad: + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + # 2) x is not 0D , y is 0D + x = paddle.rand([2, 3, 4]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + self.assertEqual(out.shape, [2, 3, 4]) + if api not in binary_api_list_without_grad: + out.backward() + self.assertEqual(x.grad.shape, [2, 3, 4]) + self.assertEqual(y.grad.shape, []) + self.assertEqual(out.grad.shape, [2, 3, 4]) 
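+            # Broadcasting an N-D operand against a 0-D operand keeps the N-D
+            # shape, while the gradient w.r.t. the 0-D operand reduces back to
+            # shape []; cases 3) and 4) below cover the mirrored and Python
+            # scalar variants.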
+ + # 3) x is 0D , y is not 0D + x = paddle.rand([]) + y = paddle.rand([2, 3, 4]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + self.assertEqual(out.shape, [2, 3, 4]) + if api not in binary_api_list_without_grad: + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, [2, 3, 4]) + self.assertEqual(out.grad.shape, [2, 3, 4]) + + # 4) x is 0D , y is scalar + x = paddle.rand([]) + y = 0.5 + x.stop_gradient = False + if isinstance(api, dict): + out = getattr(paddle.Tensor, api['cls_method'])(x, y) + self.assertEqual(out.shape, []) + + for api in binary_int_api_list_without_grad: + # 1) x/y is 0D + x = paddle.randint(-10, 10, []) + y = paddle.randint(-10, 10, []) + out = api(x, y) + self.assertEqual(out.shape, []) + + # 2) x is not 0D , y is 0D + x = paddle.randint(-10, 10, [3, 5]) + y = paddle.randint(-10, 10, []) + out = api(x, y) + self.assertEqual(out.shape, [3, 5]) + + # 3) x is 0D , y is not 0D + x = paddle.randint(-10, 10, []) + y = paddle.randint(-10, 10, [3, 5]) + out = api(x, y) + self.assertEqual(out.shape, [3, 5]) + + paddle.enable_static() + + +# Use to test zero-dim of Sundry API, which is simple and do +# not have backward, or is not need to test backward in OpTest. +class TestSundryAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.rand([]) + + def test_linear(self): + x = paddle.randn([3, 2]) + w = paddle.full(shape=[2, 4], fill_value=0.5) + b = paddle.zeros([]) + + np.testing.assert_array_equal( + F.linear(x, w, b).numpy(), F.linear(x, w).numpy() + ) + + def test_is_floating_point(self): + self.assertTrue(paddle.is_floating_point(self.x)) + + def test_is_integer(self): + x = paddle.randint(0, 10, []) + self.assertTrue(paddle.is_integer(x)) + + def test_is_tensor(self): + self.assertTrue(paddle.is_tensor(self.x)) + + def test_is_empty(self): + x = paddle.rand([3, 0, 5]) + self.assertTrue(paddle.is_empty(x)) + + def test_isfinite(self): + out = paddle.isfinite(self.x) + np.testing.assert_array_equal(out.numpy(), np.array(True)) + + def test_isinf(self): + x = paddle.to_tensor(np.array(float('-inf'))) + out = paddle.isinf(x) + np.testing.assert_array_equal(out.numpy(), np.array(True)) + + def test_isnan(self): + x = paddle.to_tensor(np.array(float('nan'))) + out = paddle.isnan(x) + np.testing.assert_array_equal(out.numpy(), np.array(True)) + + def test_isclose(self): + out = paddle.isclose(self.x, self.x) + np.testing.assert_array_equal(out.numpy(), np.array(True)) + + def test_clone(self): + out = paddle.clone(self.x) + np.testing.assert_array_equal(out.numpy(), self.x.numpy()) + + def test_assign(self): + out = paddle.assign(self.x) + np.testing.assert_array_equal(out.numpy(), self.x.numpy()) + + def test_item(self): + x = paddle.full([], 0.5) + self.assertEqual(x.item(), 0.5) + + def test_tolist(self): + x = paddle.full([], 0.5) + self.assertEqual(x.tolist(), 0.5) + + def test_numpy(self): + x = paddle.full([], 0.5) + np.testing.assert_array_equal(x.numpy(), np.array(0.5)) + + def test_numel(self): + out = paddle.numel(self.x) + self.assertEqual(out.shape, []) + np.testing.assert_array_equal(out.numpy(), np.array(1)) + + def test_rank(self): + out = paddle.rank(self.x) + self.assertEqual(out.shape, []) + np.testing.assert_array_equal(out.numpy(), np.array(0)) + + def test_shape(self): + out = 
paddle.shape(self.x) + self.assertEqual(out.shape, [0]) + np.testing.assert_array_equal(out.numpy(), np.array([])) + + +if __name__ == "__main__": + unittest.main() From a00aebe1ab38117a5cf4c20a4e0e53a5073009e6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 15 Nov 2022 13:55:17 +0800 Subject: [PATCH 014/210] [convert_to_mixed_precision] fallback to fp32 when encounter circle (#47902) --- .../passes/convert_to_mixed_precision.cc | 343 +++++++----------- 1 file changed, 127 insertions(+), 216 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index e9b188d78f16d..a37cfda021d5b 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -40,7 +40,6 @@ #include "paddle/phi/common/float16.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" -#include "paddle/phi/core/tensor_meta.h" namespace paddle { namespace inference { @@ -111,12 +110,10 @@ class ConvertToMixedPrecisionPass { black_list_(black_list), place_(paddle::CPUPlace()), executor_(place_) { - // black_list_.insert("assign"); - black_list_.insert("fill_constant"); - black_list_.insert("assign_value"); - black_list_.insert("eye"); - black_list_.insert("fill_any_like"); - black_list_.insert("fill_constant_batch_size_like"); + VLOG(4) << "black_list has "; + for (auto& name : black_list_) { + VLOG(4) << " - " << name; + } } void Run(); @@ -145,18 +142,11 @@ class ConvertToMixedPrecisionPass { // Just process special cases for weights conversion. bool WeightsShouldNotConvert(framework::ir::Node* var_node); - // To support multi block, we need to consider a lot of special cases. // Return Node* which first appers in block. - framework::ir::Node* GetRealVarNode(BlockID block_idx, - framework::ir::Node* node); - void FindVarsInMultiBlock(); - inline bool VarIsMultiPrecisionOpsOut(BlockID block_idx, - framework::ir::Node* op_node); + framework::ir::Node* GetRealVarNode(framework::ir::Node* node); - private: - // A trick. Patch for strange op, which input name equal to output name, such - // as `fused_multi_transformer` - void PatchForStrangeOp(); + // Fallback to fp32 dtype when encounter circle (Not a DAG graph). 
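+  // A "circle" is a variable name that appears both as an input and as an
+  // output of the same op (an in-place style update); any op producing such
+  // a var is kept in fp32. batch_norm is deliberately skipped in this check.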
+ void ProcessCircleCases(); private: std::string model_file_; @@ -171,35 +161,21 @@ class ConvertToMixedPrecisionPass { framework::Executor executor_; framework::Scope scope_; + std::unordered_map name2node_; std::unordered_map cast_map_; - std::unordered_map> - vars_in_multi_block_with_pair_; - std::unordered_map> - vars_in_multi_block_with_ops_; int suffix_{0}; + std::set var_names_in_circles_; + std::unique_ptr program_desc_{nullptr}; std::unique_ptr main_graph_{nullptr}; std::vector graphes_; }; framework::ir::Node* ConvertToMixedPrecisionPass::GetRealVarNode( - BlockID block_idx, framework::ir::Node* var_node) { + framework::ir::Node* var_node) { CHECK_EQ(var_node->IsVar(), true); - - if (vars_in_multi_block_with_pair_.count(var_node->Name())) { - auto origin_blockId = - vars_in_multi_block_with_pair_.at(var_node->Name()).second; - if (block_idx != origin_blockId) { - auto* graph = graphes_[origin_blockId]; - for (auto* node : graph->Nodes()) { - if (node->Name() == var_node->Name()) { - return node; - } - } - } - } - + if (name2node_.count(var_node->Name())) return name2node_[var_node->Name()]; return var_node; } @@ -212,32 +188,6 @@ inline bool ConvertToMixedPrecisionPass::VarNodeHasDtype( (type == VarType::VOCAB); } -// op1(fp32) -> var1, op2(fp16) -> var1 -// if and only if op1 and op2 both support fp16, we convert op1 and op2's -// precision. -inline bool ConvertToMixedPrecisionPass::VarIsMultiPrecisionOpsOut( - BlockID block_idx, framework::ir::Node* op_node) { - CHECK_EQ(op_node->IsOp(), true); - - for (auto* var_node : op_node->outputs) { - if (!var_node->IsVar()) continue; - auto* real_var_node = GetRealVarNode(block_idx, var_node); - if (!real_var_node->Var()->Persistable() && - vars_in_multi_block_with_ops_.count(var_node->Name())) { - for (const auto& op_type : - vars_in_multi_block_with_ops_.at(var_node->Name())) { - if (!OpSupportPrecision( - op_type, backend_, mixed_precision_, black_list_)) { - VLOG(2) << var_node->Name() - << " is multi precision op's out, so we skip convert to fp16"; - return true; - } - } - } - } - return false; -} - void ConvertToMixedPrecisionPass::ProcessInputNode( bool support_precision, framework::ir::Node* in_node, @@ -247,18 +197,13 @@ void ConvertToMixedPrecisionPass::ProcessInputNode( VarType::Type to_type, BlockID block_idx) { if (!in_node->IsVar()) return; - auto* real_node = GetRealVarNode(block_idx, in_node); + auto* real_node = GetRealVarNode(in_node); if (!VarNodeHasDtype(real_node)) return; auto* graph = graphes_[block_idx]; - bool is_main_block = block_idx == 0; auto* in_var = real_node->Var(); auto in_var_type = in_var->GetDataType(); auto prev_type = in_var_type; - bool is_in_multi_block = vars_in_multi_block_with_pair_.count(in_var->Name()); - if (!is_main_block && is_in_multi_block) { - in_var_type = vars_in_multi_block_with_pair_.at(in_var->Name()).first; - } if (support_precision) { if (in_var->Persistable() && in_var_type == VarType::FP32) { if (WeightsShouldNotConvert(in_node)) return; @@ -299,7 +244,7 @@ void ConvertToMixedPrecisionPass::ProcessInputNode( void ConvertToMixedPrecisionPass::ProcessOutputNode( BlockID block_idx, framework::ir::Node* var_node, VarType::Type to_type) { if (!var_node->IsVar()) return; - auto* real_node = GetRealVarNode(block_idx, var_node); + auto* real_node = GetRealVarNode(var_node); if (!VarNodeHasDtype(real_node)) return; auto* out_var = real_node->Var(); auto prev_type = out_var->GetDataType(); @@ -400,9 +345,17 @@ void ConvertToMixedPrecisionPass::LoadAndPrepare() { 
inference::Load(&executor_, &scope_, model_file_, params_file_); main_graph_ = std::unique_ptr( new framework::ir::Graph(*program_desc_)); + for (size_t i = 0; i < main_graph_->SubGraphsSize(); ++i) { auto* graph = main_graph_->GetSubGraph(i); graphes_.push_back(graph); + + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) continue; + if (!name2node_.count(node->Name())) { + name2node_[node->Name()] = node; + } + } } // Remove all control var @@ -411,46 +364,68 @@ void ConvertToMixedPrecisionPass::LoadAndPrepare() { arg.SetMainGraphNotOwned(main_graph_.get()); pass.Run(&arg); - FindVarsInMultiBlock(); + ProcessCircleCases(); } -void ConvertToMixedPrecisionPass::FindVarsInMultiBlock() { - std::unordered_set all_var_names_set; - std::vector> block_var_names_set(program_desc_->Size()); - for (BlockID idx = 0; idx < program_desc_->Size(); ++idx) { +// Find var names which in circles. +void ConvertToMixedPrecisionPass::ProcessCircleCases() { + std::vector vars_in_circles; + for (size_t idx = 0; idx < program_desc_->Size(); ++idx) { for (auto* op : program_desc_->Block(idx).AllOps()) { + // TODO(inference): batch_norm has circle, but we need to fuse it in conv + // op. + if (op->Type() == "batch_norm") continue; const auto& in_names = op->InputArgumentNames(); - block_var_names_set[idx].insert(in_names.begin(), in_names.end()); const auto& out_names = op->OutputArgumentNames(); - block_var_names_set[idx].insert(out_names.begin(), out_names.end()); - - if (op->HasAttr("sub_block") == false) { - for (const auto& name : out_names) { - if (all_var_names_set.count(name)) { - vars_in_multi_block_with_ops_[name].push_back(op->Type()); - } - } - } - all_var_names_set.insert(block_var_names_set[idx].begin(), - block_var_names_set[idx].end()); + std::set in_names_set(in_names.begin(), in_names.end()); + std::set out_names_set(out_names.begin(), out_names.end()); + std::set_intersection(in_names_set.begin(), + in_names_set.end(), + out_names_set.begin(), + out_names_set.end(), + std::back_inserter(vars_in_circles)); } } - CHECK_GT(program_desc_->Size(), 0U); - for (BlockID idx = 0; idx < program_desc_->Size() - 1; ++idx) { - for (BlockID jdx = idx + 1; jdx < program_desc_->Size(); ++jdx) { - std::vector vars_in_multi_block; - std::set_intersection(block_var_names_set[idx].begin(), - block_var_names_set[idx].end(), - block_var_names_set[jdx].begin(), - block_var_names_set[jdx].end(), - std::back_inserter(vars_in_multi_block)); - - for (const auto& name : vars_in_multi_block) { - vars_in_multi_block_with_pair_.emplace( - name, std::make_pair(VarType::Type(), idx)); - } - } + for (auto& name : vars_in_circles) { + var_names_in_circles_.insert(name); + } + for (auto& name : var_names_in_circles_) { + LOG(INFO) << name + << " in circles, so we will skip process those vars and ops."; + } +} + +inline void ProcessConstantOpAttr(framework::ir::Node* op_node, + VarType::Type from_type, + VarType::Type to_type) { + if (!op_node->IsOp()) return; + auto op_type = op_node->Op()->Type(); + if (op_type == "feed" || op_type == "fetch") return; + + if (op_type == "fill_constant") { + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == + static_cast(from_type)) + op_node->Op()->SetAttr("dtype", static_cast(to_type)); + } else if (op_type == "assign_value") { + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == + static_cast(from_type)) + op_node->Op()->SetAttr("dtype", static_cast(to_type)); + } else if (op_type == "eye") { + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == + 
static_cast(from_type)) + op_node->Op()->SetAttr("dtype", static_cast(to_type)); + } else if (op_type == "fill_any_like") { + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == + static_cast(from_type)) + op_node->Op()->SetAttr("dtype", static_cast(to_type)); + } else if (op_type == "cast") { + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == + static_cast(from_type)) + op_node->Op()->SetAttr("in_dtype", static_cast(to_type)); + if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == + static_cast(from_type)) + op_node->Op()->SetAttr("out_dtype", static_cast(to_type)); } } @@ -460,33 +435,7 @@ void ConvertToMixedPrecisionPass::ConvertAllFp64ToFp32( for (auto* op_node : op_nodes) { if (!op_node->IsOp()) continue; auto op_type = op_node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - - if (op_type == "fill_constant") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); - } else if (op_type == "assign_value") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); - } else if (op_type == "eye") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); - } else if (op_type == "fill_any_like") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("dtype", static_cast(VarType::FP32)); - } else if (op_type == "cast") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("in_dtype", static_cast(VarType::FP32)); - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == - static_cast(VarType::FP64)) - op_node->Op()->SetAttr("out_dtype", static_cast(VarType::FP32)); - } - + ProcessConstantOpAttr(op_node, VarType::FP64, VarType::FP32); auto inputs = op_node->inputs; for (auto* in_node : inputs) { auto* in_var = in_node->Var(); @@ -509,9 +458,6 @@ void ConvertToMixedPrecisionPass::Run() { ConvertTensorDtype(i); FixCastAttr(graph); - // A trick - PatchForStrangeOp(); - CHECK_EQ(framework::ir::VarDescIsConsistency(*graph), true); } @@ -556,28 +502,9 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { continue; } + // We can not add cast operator before ops who have sub_block, as in + // sub_block we may get a var which may be transformer by cast op. else if (op_node->Op()->HasAttr("sub_block")) { // NOLINT - // sub_block op's output dtype should be same as input dtype, if have the - // same name. - std::unordered_map in_name_to_node; - for (auto* in : op_node->inputs) { - if (!in->IsVar()) continue; - auto* real_node = GetRealVarNode(block_idx, in); - if (VarNodeHasDtype(real_node)) { - in_name_to_node[in->Name()] = in; - } - } - - for (auto* out : op_node->outputs) { - if (!out->IsVar()) continue; - auto* real_node = GetRealVarNode(block_idx, out); - if (VarNodeHasDtype(real_node)) { - if (in_name_to_node.count(out->Name())) - real_node->Var()->SetDataType( - in_name_to_node[out->Name()]->Var()->GetDataType()); - } - } - continue; } @@ -585,65 +512,75 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { // - cast weight to fp16/bf16. // - add cast op if the input dtype is not fp16/bf16. // - set output dtype. 
- // - // If a var(op's out var) appears multiple times in graph, we should not - // convert to fp16. - else if (black_list_.count(op_type) == 0 && // NOLINT - !VarIsMultiPrecisionOpsOut(block_idx, op_node)) { + else if (black_list_.count(op_type) == 0) { // NOLINT bool support_precision = OpSupportPrecision(op_type, backend_, mixed_precision_, black_list_); - // If the op has no input of float type, we will not choose the + // If op's output in circle, we should not convert to fp16. + for (auto* out_node : op_node->outputs) { + if (var_names_in_circles_.count(out_node->Name())) { + support_precision = false; + VLOG(2) << " op's output " << out_node->Name() + << " is in circle, we can not support this case, just skip."; + break; + } + } + + // If the op has no input or output of float type, we will not choose the // low precision kernel. - { - bool has_float_input{false}; + if (support_precision) { + bool has_float_in_out{false}; for (auto* in_node : op_node->inputs) { if (!in_node->IsVar()) continue; - auto* real_node = GetRealVarNode(block_idx, in_node); + if (in_node->Var()->GetType() != VarType::LOD_TENSOR) { + support_precision = false; + VLOG(2) << " op has tensor array input[" << in_node->Name() + << "], just skip."; + break; + } + auto* real_node = GetRealVarNode(in_node); + if (real_node->Var()->GetDataType() == VarType::FP16 || + real_node->Var()->GetDataType() == VarType::FP32 || + real_node->Var()->GetDataType() == VarType::FP64 || + real_node->Var()->GetDataType() == VarType::BF16) { + has_float_in_out = true; + break; + } + } + for (auto* out_node : op_node->outputs) { + if (!out_node->IsVar()) continue; + auto* real_node = GetRealVarNode(out_node); if (real_node->Var()->GetDataType() == VarType::FP16 || real_node->Var()->GetDataType() == VarType::FP32 || real_node->Var()->GetDataType() == VarType::FP64 || real_node->Var()->GetDataType() == VarType::BF16) { - has_float_input = true; + has_float_in_out = true; break; } } - if (!has_float_input) { + if (!has_float_in_out) { support_precision = false; - VLOG(2) << " op doesn't has float input, just skip."; + VLOG(2) << " op doesn't has float input and output, just skip."; } } + VLOG(2) << "op type: " << op_type << " support low precision: " << support_precision; if (support_precision) { + ProcessConstantOpAttr(op_node, VarType::FP32, to_type); VLOG(2) << " process input nodes:"; ++num_low_precision; auto inputs = op_node->inputs; - - // Just for paddle's terriable case: op's input and output has the same - // name. - std::unordered_map names_map; - for (auto* out_node : op_node->outputs) { - for (auto* in_node : op_node->inputs) { - if (out_node->Name() == in_node->Name()) { - names_map[out_node->Name()] = in_node->Name(); - } - } - } - - // Process inputs. for (auto* in_node : inputs) { ProcessInputNode( true, in_node, op_node, &suffix_, block_desc, to_type, block_idx); - if (names_map.count(in_node->Name()) && cast_map_.count(in_node)) { - names_map[in_node->Name()] = cast_map_[in_node]->Name(); - } } + VLOG(2) << " process output nodes:"; - // Process outputs. - for (auto* out_node : op_node->outputs) { + auto outputs = op_node->outputs; + for (auto* out_node : outputs) { ProcessOutputNode(block_idx, out_node, to_type); } } else { @@ -663,8 +600,10 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { // 3. check op not support fp16/bf16 or in blacklist. // - add cast op if the input dtype is not fp32. 
else { // NOLINT - VLOG(3) << "not to run fp16 op_type: " << op_type; - for (auto* in_node : op_node->inputs) { + VLOG(3) << "not to run fp16 op_type: " << op_type << ", node input size " + << op_node->inputs.size(); + auto in_nodes = op_node->inputs; + for (auto* in_node : in_nodes) { auto* in_var = in_node->Var(); if (in_var->GetDataType() == to_type) { AddCastOp(graph, @@ -716,21 +655,6 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { } } - for (auto* node : graph->Nodes()) { - if (!node->IsVar()) continue; - auto* real_node = GetRealVarNode(block_idx, node); - if (!VarNodeHasDtype(real_node)) continue; - - if (vars_in_multi_block_with_pair_.count(real_node->Name()) && - vars_in_multi_block_with_pair_.at(real_node->Name()).second == - block_idx && - vars_in_multi_block_with_pair_.at(real_node->Name()).first == - VarType::Type()) { - vars_in_multi_block_with_pair_.at(real_node->Name()).first = - real_node->Var()->GetDataType(); - } - } - if (num_low_precision) LOG(INFO) << "--- detected " << num_low_precision << " low precision ops in " << block_idx << " subgraph"; @@ -738,6 +662,7 @@ void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { // We modify op's input output precision, and we need to fix cast op in_dtype // and out_dtype attribute. +// TODO(inference): we need a cast elimination pass. void ConvertToMixedPrecisionPass::FixCastAttr(framework::ir::Graph* graph) { auto op_nodes = framework::ir::TopologySortOperations(*graph); for (auto* op_node : op_nodes) { @@ -766,7 +691,8 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { if (VarNodeHasDtype(node)) { if (node->Var()->Persistable() && node->Var()->GetDataType() == VarType::FP32) { - VLOG(2) << "weights keep to fp32: " << node->Name(); + VLOG(2) << "weights keep to fp32: " << node->Name() << ", ptr " + << reinterpret_cast(node->Var()); weights_should_be_fp32.insert(node->Name()); } } @@ -808,7 +734,6 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { std::ostringstream os; phi::CPUContext ctx; for (const auto& param : parameters) { - VLOG(3) << "Serialize param: " << param; PADDLE_ENFORCE_NOT_NULL( scope_.FindVar(param), platform::errors::NotFound( @@ -829,21 +754,6 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { mixed_program_desc.Proto()->SerializeAsString()); StrToBinary(mixed_params_file_, SerializeParams()); } - -void ConvertToMixedPrecisionPass::PatchForStrangeOp() { - for (auto* graph : graphes_) { - for (auto op_node : framework::ir::TopologySortOperations(*graph)) { - if (op_node->Name() == "fused_multi_transformer") { - auto cache_kv_inputs = op_node->Op()->Input("CacheKV"); - auto cache_kv_outputs = op_node->Op()->Output("CacheKVOut"); - CHECK_EQ(cache_kv_inputs.size(), cache_kv_outputs.size()); - for (size_t i = 0; i < cache_kv_inputs.size(); ++i) { - op_node->Op()->RenameOutput(cache_kv_outputs[i], cache_kv_inputs[i]); - } - } - } - } -} } // namespace void AddCastOp( @@ -893,6 +803,7 @@ void AddCastOp( } next_op->Op()->Rename(node->Name(), map->at(node)->Name()); IR_NODE_LINK_TO(node, map->at(node)->inputs[0]); + IR_NODE_UNLINK(node, next_op); IR_NODE_LINK_TO(map->at(node), next_op); } From e65bac286a6b44b297d1aba69d9adf703b161c60 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 15 Nov 2022 14:33:23 +0800 Subject: [PATCH 015/210] Update for scatter support fake 2d index (#47946) --- paddle/phi/infermeta/ternary.cc | 23 +++++++++++++------ .../fluid/tests/unittests/test_scatter_op.py | 19 +++++++++++++++ 2 files changed, 35 insertions(+), 7 
deletions(-) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index dc219deac0691..9b3ffbd083762 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -987,13 +987,22 @@ void ScatterInferMeta(const MetaTensor& x, const auto& updates_dims = updates.dims(); const auto& ref_dims = x.dims(); const auto& index_dims = index.dims(); - PADDLE_ENFORCE_EQ( - index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The size of Input(Ids)'s shape should be equal to 1, but " - "received the rank of Input(Ids) is %d.", - index_dims.size())); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of the index should be 1 when the " + "index is a 2D tensor, but we get %d.", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument("The index should be a 1D tensor when the " + "index is not a 2D tensor, but we get %d.", + index_dims.size())); + } PADDLE_ENFORCE_EQ( ref_dims.size(), updates_dims.size(), diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index 479498bfbb451..a830ed0a9e291 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -191,6 +191,25 @@ def test_check_grad(self): ) +class TestScatterOp6(OpTest): + def setUp(self): + self.op_type = "scatter" + self.python_api = paddle.scatter + ref_np = np.ones((3, 50)).astype("float32") + index_np = np.array([[1], [2]]).astype("int32") + updates_np = np.random.random((2, 50)).astype("float32") + output_np = np.copy(ref_np) + output_np[np.array([1, 2]).astype("int32")] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output(check_eager=False) + + def test_check_grad(self): + self.check_grad(["X", "Updates"], "Out", check_eager=False) + + class TestScatterAPI(unittest.TestCase): def setUp(self): self.places = [fluid.CPUPlace()] From 8fece428e96d22ec843d776233579b4a3fc254f7 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Tue, 15 Nov 2022 14:43:11 +0800 Subject: [PATCH 016/210] fix dist slice op (#47980) --- .../distributed/auto_parallel/operators/dist_slice.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/operators/dist_slice.py index b0d31f12dace9..18c643c1d76cb 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_slice.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_slice.py @@ -40,8 +40,8 @@ def is_input_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr in_name = op_desc.input('Input')[0] out_name = op_desc.output('Out')[0] - in_var = dist_op.serial_op.block.var(in_name) - out_var = dist_op.serial_op.block.var(out_name) + in_var = dist_op.serial_op.block._var_recursive(in_name) + out_var = dist_op.serial_op.block._var_recursive(out_name) axes = op_desc.attr('axes') in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) for axis in axes: @@ -57,8 +57,8 @@ def is_output_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr in_name = op_desc.input('Input')[0] out_name = op_desc.output('Out')[0] - in_var = dist_op.serial_op.block.var(in_name) - out_var = 
dist_op.serial_op.block.var(out_name) + in_var = dist_op.serial_op.block._var_recursive(in_name) + out_var = dist_op.serial_op.block._var_recursive(out_name) axes = op_desc.attr('axes') decrease_axis = op_desc.attr('decrease_axis') in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) From 626d7bcbce10297ee726d44a407135461484635c Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 15 Nov 2022 17:30:04 +0800 Subject: [PATCH 017/210] [Zero-Dim] Make auto parallel judge dim more strict (#47961) --- paddle/fluid/operators/batch_norm_op.cc | 2 +- python/paddle/distributed/auto_parallel/completion.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 6c6591f34abce..878ab18432cdc 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -164,7 +164,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { ctx->SetOutputDim("SavedMean", {C}); ctx->SetOutputDim("SavedVariance", {C}); ctx->ShareLoD("X", "Y"); - if (ctx->HasInput("ReserveSpace")) { + if (ctx->HasOutput("ReserveSpace")) { ctx->SetOutputDim("ReserveSpace", {-1}); } } diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index c0f70f482dd17..7f5e0fee77526 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -1239,7 +1239,7 @@ def _get_op_by_id(ops, id): input_var ).dims_mapping else: - if fwd_op_dist_attr.get_input_dims_mapping(input_name): + if input_name in forward_op.input_arg_names: ref_dims_mapping = ( fwd_op_dist_attr.get_input_dims_mapping( input_name @@ -1544,7 +1544,7 @@ def _get_op_by_id(ops, id): input_var ).dims_mapping else: - if fwd_op_dist_attr.get_input_dims_mapping(input_name): + if input_name in forward_op.input_arg_names: ref_dims_mapping = ( fwd_op_dist_attr.get_input_dims_mapping( input_name From 519e7426dd4bbf0b7134b0b59fd9db9cbb0c7102 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 15 Nov 2022 11:20:16 +0100 Subject: [PATCH 018/210] Added optimization pass for oneDNN layernorm kernel (#47782) * optimization for ln * fix * added output to gpd * added formatting * fix --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/graph_pattern_detector.cc | 23 ++++ .../framework/ir/graph_pattern_detector.h | 13 +++ .../layer_norm_onednn_optimization_pass.cc | 110 ++++++++++++++++++ .../layer_norm_onednn_optimization_pass.h | 34 ++++++ .../inference/api/paddle_pass_builder.cc | 2 + .../operators/mkldnn/layer_norm_mkldnn_op.cc | 43 ++++--- 7 files changed, 211 insertions(+), 15 deletions(-) mode change 100644 => 100755 paddle/fluid/framework/ir/graph_pattern_detector.h create mode 100644 paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f9359d0b58bca..9ea065b567f0b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -178,6 +178,7 @@ if(WITH_MKLDNN) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(layer_norm_onednn_optimization_pass inference 
DIR mkldnn) pass_library(operator_scale_onednn_fuse_pass inference DIR mkldnn) pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR mkldnn) pass_library(operator_unsqueeze2_onednn_fuse_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 45c4c50318f60..746e6077fdf5a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -940,6 +940,29 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, return bn_out_var; } +PDNode *patterns::LayerNormShiftScale::operator()() { + auto layer_norm_in = pattern->NewNode(layer_norm_in_repr()) + ->AsInput() + ->assert_is_op_input("layer_norm", "X"); + auto layer_norm_bias = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_op_input("layer_norm", "Bias"); + auto layer_norm_scale = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_op_input("layer_norm", "Scale"); + + auto layer_norm_op = + pattern->NewNode(layer_norm_op_repr())->assert_is_op("layer_norm"); + + auto layer_norm_out = pattern->NewNode(layer_norm_out_repr()) + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + + layer_norm_op->LinksFrom({layer_norm_in, layer_norm_bias, layer_norm_scale}) + .LinksTo({layer_norm_out}); + return layer_norm_out; +} + PDNode *patterns::OperatorActivation::operator()( const std::string &operator_type, const std::string &activation_type) { auto *preceding_op = diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h old mode 100644 new mode 100755 index 0ec4e0c2767c7..fdff82d30caaa --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -526,6 +526,19 @@ struct ConvBN : public PatternBase { PATTERN_DECL_NODE(bn_saved_variance); }; +struct LayerNormShiftScale : public PatternBase { + LayerNormShiftScale(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "layer_norm_shift_scale") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(layer_norm_in); + PATTERN_DECL_NODE(layer_norm_op); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); +}; + struct OperatorActivation : public PatternBase { OperatorActivation(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "operator_activation") {} diff --git a/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc new file mode 100644 index 0000000000000..e5c1a43d92301 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
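+// This pass packs the layer_norm Scale and Bias inputs into one persistable
+// ScaleShift tensor of length 2 * C, so the oneDNN layer_norm kernel can use
+// that buffer directly as its weights memory instead of copying both tensors
+// on every run.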
+ +#include "paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void LayerNormOneDNNOptimizationPass::ApplyImpl(Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("layer_norm_onednn_optimization_pass", graph); + + GraphPatternDetector gpd; + patterns::LayerNormShiftScale layer_norm_shift_scale_pattern( + gpd.mutable_pattern(), "layer_norm_onednn_optimization_pass"); + layer_norm_shift_scale_pattern(); + + int found_layer_norm = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_op, layer_norm_op, layer_norm_shift_scale_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_bias, layer_norm_bias, layer_norm_shift_scale_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_scale, layer_norm_scale, layer_norm_shift_scale_pattern); + + if (layer_norm_op->Op()->HasAttr("use_mkldnn") && + !(PADDLE_GET_CONST(bool, layer_norm_op->Op()->GetAttr("use_mkldnn")))) { + VLOG(4) << "Only oneDNN version of layer_norm can be optimized to " + "include Bias and Shift in a single tensor."; + return; + } + + auto *scope = param_scope(); + + auto ln_bias_name = layer_norm_op->Op()->Input("Bias"); + auto ln_scale_name = layer_norm_op->Op()->Input("Scale"); + + auto *ln_bias_tensor = + scope->FindVar(ln_bias_name[0])->GetMutable(); + auto *ln_scale_tensor = + scope->FindVar(ln_scale_name[0])->GetMutable(); + + const int channels = ln_bias_tensor->dims()[0]; + + VarDesc scale_shift_desc(patterns::PDNodeName( + "layer_norm_onednn_optimization_pass", "ScaleShift")); + scale_shift_desc.SetShape({channels * 2}); + scale_shift_desc.SetDataType( + framework::TransToProtoVarType(ln_bias_tensor->dtype())); + scale_shift_desc.SetPersistable(true); + + auto scale_shift_node = g->CreateVarNode(&scale_shift_desc); + auto *scale_shift_tensor = + scope->Var(scale_shift_node->Name())->GetMutable(); + + scale_shift_tensor->Resize(phi::make_ddim({channels * 2})); + + memcpy(scale_shift_tensor->mutable_data(platform::CPUPlace()), + ln_scale_tensor->data(), + channels * sizeof(float)); + + memcpy(scale_shift_tensor->data() + channels, + ln_bias_tensor->data(), + channels * sizeof(float)); + + layer_norm_op->Op()->SetInput("ScaleShift", {scale_shift_node->Name()}); + + IR_NODE_LINK_TO(scale_shift_node, layer_norm_op); + found_layer_norm++; + }; + + gpd(graph, handler); + AddStatis(found_layer_norm); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_layer_norm > 0) + PrettyLogDetail("--- optimized %d layer_norms by merging Scale and Bias", + found_layer_norm); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(layer_norm_onednn_optimization_pass, + paddle::framework::ir::LayerNormOneDNNOptimizationPass); +REGISTER_PASS_CAPABILITY(layer_norm_onednn_optimization_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().GE( + "layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h new file mode 100644 index 0000000000000..2c3dbb636e40d --- /dev/null +++ 
b/paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class LayerNormOneDNNOptimizationPass : public FusePassBase { + public: + virtual ~LayerNormOneDNNOptimizationPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index e6dc79b509dab..aad6f63052040 100755 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -346,6 +346,7 @@ void CpuPassStrategy::EnableMKLDNN() { "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // + "layer_norm_onednn_optimization_pass", // "operator_scale_onednn_fuse_pass", // "operator_unsqueeze2_onednn_fuse_pass", // "operator_reshape2_onednn_fuse_pass", // @@ -443,6 +444,7 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("scale_matmul_fuse_pass"); passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass"); passes_.push_back("matmul_elementwise_add_mkldnn_fuse_pass"); + passes_.push_back("layer_norm_onednn_optimization_pass"); passes_.push_back("operator_scale_onednn_fuse_pass"); passes_.push_back("operator_unsqueeze2_onednn_fuse_pass"); passes_.push_back("operator_reshape2_onednn_fuse_pass"); diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 24ae86df61ba9..dee65287724a0 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -41,19 +41,32 @@ class LayerNormOneDNNHandler } std::shared_ptr AcquireScaleShiftMemory( - const phi::DenseTensor* scale, const phi::DenseTensor* shift) { - // OneDNN requires a single piece of memory for scale and shift data - const unsigned int C = phi::vectorize(scale->dims())[0]; - - auto scaleshift_memory = - this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc()); - - auto mem_data_handle = - reinterpret_cast(scaleshift_memory->get_data_handle()); - std::copy(scale->data(), scale->data() + C, mem_data_handle); - std::copy( - shift->data(), shift->data() + C, mem_data_handle + C); - return scaleshift_memory; + const phi::DenseTensor* scale, + const phi::DenseTensor* shift, + const framework::ExecutionContext& ctx) { + // OneDNN requires a single piece of memory for scale and shift data. 
During + // inference both pieces of memory are merged inside + // layer_norm_onednn_optimization_pass, but during training we have to + // manually copy them into new memory buffer + auto* scaleshift = ctx.Input("ScaleShift"); + if (scaleshift) { + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->weights_desc(), + platform::to_void_cast(scaleshift->data())); + } else { + const unsigned int C = phi::vectorize(scale->dims())[0]; + + auto scaleshift_memory = + this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc()); + + auto mem_data_handle = + reinterpret_cast(scaleshift_memory->get_data_handle()); + std::copy( + scale->data(), scale->data() + C, mem_data_handle); + std::copy( + shift->data(), shift->data() + C, mem_data_handle + C); + return scaleshift_memory; + } } std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { @@ -77,9 +90,9 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); + auto* out = ctx.Output("Y"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Y"); const float epsilon = ctx.Attr("epsilon"); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -129,7 +142,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (with_scaleshift) { std::shared_ptr scaleshift_memory = - handler.AcquireScaleShiftMemory(scale, bias); + handler.AcquireScaleShiftMemory(scale, bias, ctx); args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory}); } From 2b81d13c7f2b7f93b378bbef9d877f9481e94a90 Mon Sep 17 00:00:00 2001 From: 1want2sleep <116695878+1want2sleep@users.noreply.github.com> Date: Tue, 15 Nov 2022 18:53:16 +0800 Subject: [PATCH 019/210] =?UTF-8?q?=E6=9B=B4=E6=94=B9=E4=BA=86=E5=BC=95?= =?UTF-8?q?=E7=94=A8=E6=A0=BC=E5=BC=8F=20(#47963)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix some docs bugs; test=document_fix * Update batch_sampler.py * Update dataset.py * Update dataset.py * Update sampler.py * for codestyle; test=document_fix * fix copy-from issue; test=document_fix Co-authored-by: Ligoml <39876205+Ligoml@users.noreply.github.com> Co-authored-by: Ligoml --- .../paddle/fluid/dataloader/batch_sampler.py | 43 ++++++++++--------- python/paddle/fluid/dataloader/dataset.py | 21 +++++---- python/paddle/fluid/dataloader/sampler.py | 17 ++++---- 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py index 5ac1c79d0cdf7..ff749271e56bb 100644 --- a/python/paddle/fluid/dataloader/batch_sampler.py +++ b/python/paddle/fluid/dataloader/batch_sampler.py @@ -37,20 +37,20 @@ class BatchSampler(Sampler): Args: - dataset(Dataset): this could be a :code:`paddle.io.Dataset` - implement or other python object which implemented + dataset(Dataset, optional): this should be an instance of a subclass of :ref:`api_paddle_io_Dataset` or + :ref:`api_paddle_io_IterableDataset` or other python object which implemented :code:`__len__` for BatchSampler to get indices as the - range of :attr:`dataset` length. Default None. - sampler (Sampler): this could be a :code:`paddle.io.Dataset` - instance which implemented :code:`__iter__` to yield + range of :attr:`dataset` length. Default None, disabled. + sampler (Sampler, optional): this should be a :ref:`api_paddle_io_Sample` + instance which implemented :code:`__iter__` to generate sample indices. 
:attr:`sampler` and :attr:`dataset` can not be set in the same time. If :attr:`sampler` - is set, :attr:`shuffle` should not be set. Default None. - shuffle(bool): whether to shuffle indices order before genrating - batch indices. Default False. - batch_size(int): sample indice number in a mini-batch indices. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False + is set, :attr:`dataset` should not be set. Default None, disabled. + shuffle(bool, optional): whether to shuffle indices order before generating + batch indices. Default False, don't shuffle indices before generating batch indices. + batch_size(int, optional): sample indice number in a mini-batch indices. default 1, each mini-batch includes 1 sample. + drop_last(bool, optional): whether drop the last incomplete (less than 1 mini-batch) batch dataset. Default False, keep it. + see :ref:`api_paddle_io_DataLoader` Returns: BatchSampler: an iterable object for indices iterating @@ -92,7 +92,6 @@ def __len__(self): print(batch_indices) - see `paddle.io.DataLoader` """ @@ -183,22 +182,24 @@ class DistributedBatchSampler(BatchSampler): Dataset is assumed to be of constant size. Args: - dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + dataset(Dataset): this could be an instance of subclass of :ref:`api_paddle_io_Dataset` or other python object which implemented - `__len__` for BatchSampler to get sample - number of data source. - batch_size(int): sample indice number in a mini-batch indices. + `__len__` for BatchSampler to get indices of samples. + batch_size(int): sample size of each mini-batch. num_replicas(int, optional): porcess number in distributed training. If :attr:`num_replicas` is None, :attr:`num_replicas` will be - retrieved from :code:`paddle.distributed.ParallenEnv`. + retrieved from :ref:`api_paddle_distributed_ParallelEnv` . Default None. rank(int, optional): the rank of the current process among :attr:`num_replicas` processes. If :attr:`rank` is None, :attr:`rank` is retrieved from - :code:`paddle.distributed.ParallenEnv`. Default None. - shuffle(bool): whther to shuffle indices order before genrating + :ref:`api_paddle_distributed_ParallelEnv`. Default None. + shuffle(bool, optional): whther to shuffle indices order before genrating batch indices. Default False. - drop_last(bool): whether drop the last incomplete batch dataset size - is not divisible by the batch size. Default False + drop_last(bool, optional): whether drop the last incomplete(less than a mini-batch) batch dataset size. + Default False. + + Returns: + DistributedBatchSampler, return an iterable object for indices iterating. Examples: .. code-block:: python diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 04e03ec844aac..6d62cd9fe0a03 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -89,19 +89,20 @@ class IterableDataset(Dataset): An abstract class to encapsulate methods and behaviors of iterable datasets. All datasets in iterable-style (can only get sample one by one sequentially, like - a Python iterator) should be a subclass of `paddle.io.IterableDataset`. All subclasses should + a Python iterator) should be a subclass of :ref:`api_paddle_io_IterableDataset` . All subclasses should implement following methods: - :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :code:`paddle.io.DataLoader`. 
+ :code:`__iter__`: yield sample sequentially. This method is required by reading dataset sample in :ref:`api_paddle_io_DataLoader` . .. note:: do not implement :code:`__getitem__` and :code:`__len__` in IterableDataset, should not be called either. - see :code:`paddle.io.DataLoader`. + see :ref:`api_paddle_io_DataLoader` . Examples: .. code-block:: python + :name: code-example1 import numpy as np from paddle.io import IterableDataset @@ -128,9 +129,10 @@ def __iter__(self): among workers as follows. In both the methods, worker information that can be getted in a worker process by `paddle.io.get_worker_info` will be needed. - Example 1: splitting data copy in each worker in :code:`__iter__` + splitting data copy in each worker in :code:`__iter__` .. code-block:: python + :name: code-example2 import math import paddle @@ -169,9 +171,10 @@ def __iter__(self): print(data) # outputs: [2, 5, 3, 6, 4, 7] - Example 2: splitting data copy in each worker by :code:`worker_init_fn` + splitting data copy in each worker by :code:`worker_init_fn` .. code-block:: python + :name: code-example3 import math import paddle @@ -370,16 +373,16 @@ def __getitem__(self, idx): class ChainDataset(IterableDataset): """ - A Dataset which chains multiple iterable-tyle datasets. + A Dataset which chains multiple iterable-style datasets. This dataset is used for assembling multiple datasets which should - be :code:`paddle.io.IterableDataset`. + be :ref:`api_paddle_io_IterableDataset`. Args: - datasets(list of Dataset): List of datasets to be chainned. + datasets(list of IterableDatasets): List of datasets to be chainned. Returns: - Dataset: A Dataset which chains fields of multiple datasets. + paddle.io.IterableDataset: A Dataset which chains fields of multiple datasets. Examples: diff --git a/python/paddle/fluid/dataloader/sampler.py b/python/paddle/fluid/dataloader/sampler.py index afd8fa7da0257..a6ec3ffbae9b8 100644 --- a/python/paddle/fluid/dataloader/sampler.py +++ b/python/paddle/fluid/dataloader/sampler.py @@ -151,16 +151,16 @@ class RandomSampler(Sampler): Args: data_source(Dataset): dataset to sample, this could be an - instance of :code:`paddle.io.Dataset` other Python - object which implemented :code:`__len__`. - replacement(bool): If False, sample the whole dataset, If False, - set :attr:`num_samples` for how many sample to draw. Default False. - num_samples(int): set sample number to draw if :attr:`replacement` - is True. Default None. - generator(Generator): specify a generator to sample the data source. Default None + instance of :ref:`api_paddle_io_Dataset` or :ref:`api_paddle_io_IterableDataset` or other Python + object which implemented :code:`__len__` to get indices as the range of :code:`dataset` length. Default None. + replacement(bool, optional): If False, sample the whole dataset, If True, + set :attr:`num_samples` for how many samples to draw. Default False. + num_samples(int, optional): set sample number to draw if :attr:`replacement` + is True, then it will take samples according to the number you set. Default None, disabled. + generator(Generator, optional): specify a generator to sample the :code:`data_source`. Default None, disabled. Returns: - Sampler: a Sampler yield sample index randomly + RandomSampler: a Sampler yield sample index randomly. 
Examples: @@ -185,7 +185,6 @@ def __len__(self): for index in sampler: print(index) - see `paddle.io.Sampler` """ def __init__( From 21d4fa0280eb4e081de489bbcf40ece7977ae669 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 15 Nov 2022 19:41:19 +0800 Subject: [PATCH 020/210] fix onednn bugs, test=document_fix (#48013) --- paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index dee65287724a0..1cee039640f8c 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -52,7 +52,7 @@ class LayerNormOneDNNHandler if (scaleshift) { return this->AcquireMemoryFromPrimitive( this->fwd_pd_->weights_desc(), - platform::to_void_cast(scaleshift->data())); + phi::funcs::to_void_cast(scaleshift->data())); } else { const unsigned int C = phi::vectorize(scale->dims())[0]; From a8aeb6042b2f57603e4481f2c3871527b6c7633b Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 15 Nov 2022 20:40:02 +0800 Subject: [PATCH 021/210] fix coverage ci bug, test=document_fix (#48005) --- tools/coverage/paddle_coverage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index ed20025377f3e..e28af10862477 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -50,7 +50,7 @@ function gen_full_html_report() { '/paddle/paddle/fluid/string/*' \ '/paddle/paddle/fluid/eager/*' \ '/paddle/paddle/phi/*' \ - '/paddle/paddle/utils/*' \ + '/paddle/paddle/utils/*' \ -o coverage-full.tmp \ --rc lcov_branch_coverage=0 From fd550c1b3bfc68425d06dd172bea27c6b1128115 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 15 Nov 2022 20:41:24 +0800 Subject: [PATCH 022/210] [Opt Error Message] Opt error message when selecting kernels under phi (#47970) * opt error message when selecting kernels under phi * fix for loop * polish error message * polish error message, split into 3 error condition * polish error message --- paddle/phi/common/backend.h | 35 ++++++++++ paddle/phi/common/data_type.h | 41 +++++++++++ paddle/phi/core/kernel_factory.cc | 111 +++++++++++++++++++++++++++--- 3 files changed, 179 insertions(+), 8 deletions(-) diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 50fbd8e072461..2ddafdac520eb 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -182,6 +182,41 @@ inline Backend StringToBackend(const char* backend_cstr) { } } +inline std::string BackendToString(const Backend& backend) { + switch (backend) { + case Backend::UNDEFINED: + return "Undefined(ALL_BACKEND)"; + case Backend::CPU: + return "CPU"; + case Backend::GPU: + return "GPU"; + case Backend::XPU: + return "XPU"; + case Backend::NPU: + return "NPU"; + case Backend::MLU: + return "MLU"; + case Backend::ONEDNN: + return "ONEDNN"; + case Backend::GPUDNN: + return "GPUDNN"; + case Backend::KPS: + return "KPS"; + case Backend::IPU: + return "IPU"; + default: + size_t device_type_id_ = static_cast(backend) - + static_cast(Backend::NUM_BACKENDS); + std::string device_type = phi::GetGlobalDeviceType(device_type_id_); + if (!device_type.empty()) { + return device_type; + } else { + PD_THROW( + "Invalid enum backend type `", static_cast(backend), "`."); + } + } +} + } // namespace experimental } // namespace paddle diff --git 
a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index d1cd0f69f2bec..339f240dae268 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -212,6 +212,47 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) { return os; } +inline std::string DataTypeToString(const DataType& dtype) { + switch (dtype) { + case DataType::UNDEFINED: + return "Undefined(ALL_DTYPE)"; + case DataType::BOOL: + return "bool"; + case DataType::INT8: + return "int8"; + case DataType::UINT8: + return "uint8"; + case DataType::INT16: + return "int16"; + case DataType::UINT16: + return "uint16"; + case DataType::INT32: + return "int32"; + case DataType::UINT32: + return "uint32"; + case DataType::INT64: + return "int64"; + case DataType::UINT64: + return "uint64"; + case DataType::BFLOAT16: + return "bfloat16"; + case DataType::FLOAT16: + return "float16"; + case DataType::FLOAT32: + return "float32"; + case DataType::FLOAT64: + return "float64"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; + case DataType::PSTRING: + return "pstring"; + default: + PD_THROW("Invalid enum data type `", static_cast(dtype), "`."); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index bbfe10591f0f9..a2b9f5971756b 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -28,6 +28,9 @@ namespace phi { const static Kernel empty_kernel; // NOLINT +std::string kernel_selection_error_message(const std::string& kernel_name, + const KernelKey& target_key); + uint32_t KernelKey::Hash::operator()(const KernelKey& key) const { uint32_t hash_value = 0; // |----31-20------|---19-12---|---11-8----|---7-0---| @@ -141,9 +144,10 @@ KernelResult KernelFactory::SelectKernelOrThrowError( kernel_iter == iter->second.end() && kernel_key.backend() == Backend::CPU, true, phi::errors::NotFound( - "The kernel with key %s of kernel `%s` is not registered.", + "The kernel with key %s of kernel `%s` is not registered. %s", kernel_key, - kernel_name)); + kernel_name, + kernel_selection_error_message(kernel_name, kernel_key))); #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) VLOG(6) << "fluid_op_name: " << TransToFluidOpName(kernel_name); @@ -168,10 +172,11 @@ KernelResult KernelFactory::SelectKernelOrThrowError( kernel_iter, iter->second.end(), phi::errors::NotFound( - "The kernel with key %s of kernel `%s` is not registered and" - " fail to fallback to CPU one.", + "The kernel with key %s of kernel `%s` is not registered and " + "fail to fallback to CPU one. %s", kernel_key, - kernel_name)); + kernel_name, + kernel_selection_error_message(kernel_name, kernel_key))); VLOG(3) << "missing " << kernel_key.backend() << " kernel: " << kernel_name << ", expected_kernel_key:" << kernel_key @@ -184,12 +189,13 @@ KernelResult KernelFactory::SelectKernelOrThrowError( kernel_iter, iter->second.end(), phi::errors::NotFound( - "The kernel with key %s of kernel `%s` is not registered and" - " the current value of FLAGS_enable_api_kernel_fallback(bool," + "The kernel with key %s of kernel `%s` is not registered. %s " + "The current value of FLAGS_enable_api_kernel_fallback(bool," " default true) is false. 
If you want to fallback this kernel" " to CPU one, please set the flag true before run again.", kernel_key, - kernel_name)); + kernel_name, + kernel_selection_error_message(kernel_name, kernel_key))); return {kernel_iter->second, false}; } @@ -344,4 +350,93 @@ std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) { return os; } +// return all kernel selection error message of specific kernel_name: +// 1. If target_key not supports target backend, output "Selected wrong Backend +// ..." +// 2. If target_key not supports target datatype, output "Selected wrong +// DataType ..." +// 3. `target_key` is still not supported, output all kernel keys of +// corresponding kernel_name: +// { +// (CPU, NCHW, [int8, int16, ...]); +// (GPU, Undefined(AnyLayout), [float32, float64, ...]); +// ... +// } +std::string kernel_selection_error_message(const std::string& kernel_name, + const KernelKey& target_key) { + PADDLE_ENFORCE_NE( + KernelFactory::Instance().kernels().find(kernel_name), + KernelFactory::Instance().kernels().end(), + phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); + + // Init data structure + bool support_backend = false; + bool support_dtype = false; + std::unordered_map> all_kernel_key; + std::unordered_set backend_set; + std::unordered_set dtype_set; + + // Record all kernel information of kernel_name + for (auto iter : KernelFactory::Instance().kernels()[kernel_name]) { + KernelKey kernel_key = iter.first; + if (kernel_key.backend() == target_key.backend()) { + support_backend = true; + if (kernel_key.dtype() == target_key.dtype()) { + support_dtype = true; + } + dtype_set.insert( + paddle::experimental::DataTypeToString(kernel_key.dtype())); + } + backend_set.insert( + paddle::experimental::BackendToString(kernel_key.backend())); + all_kernel_key[paddle::experimental::BackendToString(kernel_key.backend()) + + ", " + phi::DataLayoutToString(kernel_key.layout())] + .push_back(paddle::experimental::DataTypeToString(kernel_key.dtype())); + } + // 1. If target_key not supports target backend, output "Selected wrong + // Backend ..." + if (!support_backend) { + std::string error_message = ""; + for (auto iter = backend_set.begin(); iter != backend_set.end(); ++iter) { + error_message += *iter; + error_message += ", "; + } + error_message = error_message.substr(0, error_message.length() - 2); + return "Selected wrong Backend `" + + paddle::experimental::BackendToString(target_key.backend()) + + "`. Paddle support following Backends: " + error_message + "."; + } + // 2. If target_key not supports target datatype, output "Selected wrong + // DataType ..." + if (!support_dtype) { + std::string error_message = ""; + for (auto iter = dtype_set.begin(); iter != dtype_set.end(); ++iter) { + error_message += *iter; + error_message += ", "; + } + error_message = error_message.substr(0, error_message.length() - 2); + return "Selected wrong DataType `" + + paddle::experimental::DataTypeToString(target_key.dtype()) + + "`. Paddle support following DataTypes: " + error_message + "."; + } + // 3. 
`target_key` is still not supported, output all kernel keys of + // corresponding kernel_name + std::string message = "Currently, paddle support following kernel keys of `" + + kernel_name + "`: { "; + for (auto iter = all_kernel_key.begin(); iter != all_kernel_key.end(); + ++iter) { + message += "(" + iter->first + ", ["; + std::vector& dtype_vec = iter->second; + for (std::size_t i = 0; i < dtype_vec.size(); ++i) { + message += dtype_vec[i]; + if (i + 1 != dtype_vec.size()) { + message += ", "; + } + } + message += "]); "; + } + message += "}."; + return message; +} + } // namespace phi From 5859d0a60af80c50c0e36c79f961b98975af3448 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 15 Nov 2022 20:57:55 +0800 Subject: [PATCH 023/210] add gather dtype err msg (#48002) --- paddle/phi/kernels/cpu/gather_grad_kernel.cc | 4 ++++ paddle/phi/kernels/cpu/gather_kernel.cc | 4 ++++ paddle/phi/kernels/gpu/gather_grad_kernel.cu | 4 ++++ paddle/phi/kernels/gpu/gather_kernel.cu | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc index f0a6948018afc..f7f0ac6b2e0fe 100644 --- a/paddle/phi/kernels/cpu/gather_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc @@ -65,6 +65,10 @@ void GatherGradKernel(const Context& dev_ctx, phi::funcs::ScatterAssignAdd( dev_ctx, out_grad, index, x_grad); } + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The data type of Input(Index) of gather_grad must be int32 or int64 " + "on CPU.")); } } diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc index 9207a05b9dcce..9f6e7d2291a1b 100644 --- a/paddle/phi/kernels/cpu/gather_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_kernel.cc @@ -49,6 +49,10 @@ void GatherKernel(const Context& dev_ctx, phi::funcs::CPUGather(dev_ctx, x, index, out); } else if (index_type == phi::DataType::INT64) { phi::funcs::CPUGather(dev_ctx, x, index, out); + } else { + PADDLE_THROW( + phi::errors::InvalidArgument("The data type of Input(Index) of gather " + "must be int32 or int64 on CPU.")); } } diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu index 6965c2b0c244e..56b6f136723e6 100644 --- a/paddle/phi/kernels/gpu/gather_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu @@ -55,6 +55,10 @@ void GatherGradKernel(const Context& dev_ctx, } else if (index_type == DataType::INT64) { phi::funcs::GPUScatterAssign( dev_ctx, out_grad, index, x_grad, overwrite); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The data type of Input(Index) of gather_grad must be int32 or int64 " + "on GPU.")); } } diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu index 8ba9edb49fac5..931f7b6431d9b 100644 --- a/paddle/phi/kernels/gpu/gather_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_kernel.cu @@ -52,6 +52,10 @@ void GatherKernel(const Context& dev_ctx, phi::funcs::GPUGather(dev_ctx, x, index, out); } else if (index_type == phi::DataType::INT16) { phi::funcs::GPUGather(dev_ctx, x, index, out); + } else { + PADDLE_THROW( + phi::errors::InvalidArgument("The data type of Input(Index) of gather " + "must be int16, int32 or int64 on GPU.")); } } From 39c85064a27a6a6ab0d8eed8d8e996caf5302ff8 Mon Sep 17 00:00:00 2001 From: czr-gc <96037699+czr-gc@users.noreply.github.com> Date: Wed, 16 Nov 2022 09:53:30 +0800 Subject: [PATCH 024/210] feat(ipu): add paddle 
inference support for model_runtime. (#47364) --- .../ir/ipu/inference_process_pass.cc | 4 ++ paddle/fluid/inference/analysis/argument.h | 3 + .../analysis/passes/ir_graph_build_pass.cc | 3 + paddle/fluid/inference/api/analysis_config.cc | 11 +++- .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/api/paddle_analysis_config.h | 11 +++- .../inference/tests/api/ipu_resnet50_test.cc | 63 +++++++++++++++++++ paddle/fluid/pybind/inference_api.cc | 3 +- 8 files changed, 95 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 11679c95b1133..0213d20d30c71 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -86,6 +86,10 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { } } + // Set executor + ipu_strategy_instance_->enable_model_runtime_executor = + graph->Get("enable_model_runtime_executor"); + // Set available memory proportion for matmul/conv ipu_strategy_instance_->available_memory_proportion = graph->Get("available_memory_proportion"); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 76b51f5890ff3..496cd9d1e2d53 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -353,6 +353,9 @@ struct Argument { DECL_ARGUMENT_FIELD(ipu_custom_patterns, IpuCustomPatterns, std::vector>); + DECL_ARGUMENT_FIELD(ipu_enable_model_runtime_executor, + IpuEnableModelRuntimeExecutor, + bool); // npu related DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index e07eaa64615c8..18f5c9e4a9c6c 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -97,6 +97,9 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { &argument->ipu_custom_ops_info()); argument->main_graph().SetNotOwned("custom_patterns", &argument->ipu_custom_patterns()); + argument->main_graph().SetNotOwned( + "enable_model_runtime_executor", + &argument->ipu_enable_model_runtime_executor()); } } #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 08d569635b0c9..7d243c6df2a55 100755 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -205,11 +205,13 @@ void AnalysisConfig::EnableIpu(int ipu_device_num, void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, float ipu_available_memory_proportion, - bool ipu_enable_half_partial) { + bool ipu_enable_half_partial, + bool ipu_enable_model_runtime_executor) { ipu_enable_fp16_ = ipu_enable_fp16; ipu_replica_num_ = ipu_replica_num; ipu_available_memory_proportion_ = ipu_available_memory_proportion; ipu_enable_half_partial_ = ipu_enable_half_partial; + ipu_enable_model_runtime_executor_ = ipu_enable_model_runtime_executor; Update(); } @@ -284,7 +286,7 @@ void AnalysisConfig::LoadIpuConfig(const std::string &config_path) { if (ipu_config_mapper_.find(key) == ipu_config_mapper_.end()) { PADDLE_THROW(platform::errors::InvalidArgument( - "invalid key {} in IPU config", key)); + "invalid key {} in IPU config: ", key)); } switch (ipu_config_mapper_.at(key)) { case ipu_config_code::ipu_device_num: @@ -317,6 +319,9 @@ void 
AnalysisConfig::LoadIpuConfig(const std::string &config_path) { case ipu_config_code::ipu_custom_patterns: ipu_custom_patterns_ = string2vector(value); break; + case ipu_config_code::ipu_enable_model_runtime_executor: + ipu_enable_model_runtime_executor_ = string2bool(value); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -482,6 +487,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_replica_num_); CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + CP_MEMBER(ipu_enable_model_runtime_executor_); CP_MEMBER(ipu_custom_ops_info_); CP_MEMBER(ipu_custom_patterns_); @@ -1061,6 +1067,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ipu_replica_num_; ss << ipu_available_memory_proportion_; ss << ipu_enable_half_partial_; + ss << ipu_enable_model_runtime_executor_; for (auto custom_op : ipu_custom_ops_info_) for (auto attr : custom_op) ss << attr; ss << ";"; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 48dc6f0afcda7..d2b0ba0a5fcf8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1185,6 +1185,8 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetIpuAvailableMemoryProportion( config_.ipu_available_memory_proportion_); argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_); + argument_.SetIpuEnableModelRuntimeExecutor( + config_.ipu_enable_model_runtime_executor_); argument_.SetIpuCustomOpsInfo(config_.ipu_custom_ops_info_); argument_.SetIpuCustomPatterns(config_.ipu_custom_patterns_); #endif diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index af34bfc796d21..0fef4f6ced5fd 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -297,7 +297,8 @@ struct PD_INFER_DECL AnalysisConfig { ipu_available_memory_proportion, ipu_enable_half_partial, ipu_custom_ops_info, - ipu_custom_patterns + ipu_custom_patterns, + ipu_enable_model_runtime_executor, }; /// @@ -323,11 +324,14 @@ struct PD_INFER_DECL AnalysisConfig { /// matmul/conv. /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work /// with fp16. + /// \param ipu_enable_model_runtime_executor whether to use model_runtime + /// executor. /// void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, float ipu_available_memory_proportion = 1.0, - bool ipu_enable_half_partial = false); + bool ipu_enable_half_partial = false, + bool ipu_enable_model_runtime_executor = false); /// /// \brief Set IPU custom ops and patterns. 
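A minimal usage sketch of the extended SetIpuConfig declared above, reusing the argument values exercised by the ipu_resnet50 multi-thread test added later in this patch; the model paths are placeholders and the surrounding setup (header, predictor creation) is assumed rather than prescribed by the patch:

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    // Sketch: enable the IPU backend and opt into the model_runtime executor.
    paddle::AnalysisConfig config;
    // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining
    config.EnableIpu(1, 1, false);
    // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion,
    // ipu_enable_half_partial, ipu_enable_model_runtime_executor
    config.SetIpuConfig(false, 1, 1.0, false, true);
    config.SetModel("/path/to/model/model", "/path/to/model/params");  // placeholder paths
    auto predictor = paddle::CreatePaddlePredictor(config);
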
@@ -1176,6 +1180,7 @@ struct PD_INFER_DECL AnalysisConfig { int ipu_replica_num_{1}; float ipu_available_memory_proportion_{1.0}; bool ipu_enable_half_partial_{false}; + bool ipu_enable_model_runtime_executor_{false}; std::vector> ipu_custom_ops_info_; std::vector> ipu_custom_patterns_; @@ -1190,6 +1195,8 @@ struct PD_INFER_DECL AnalysisConfig { {"ipu_available_memory_proportion", ipu_config_code::ipu_available_memory_proportion}, {"ipu_enable_half_partial", ipu_config_code::ipu_enable_half_partial}, + {"ipu_enable_model_runtime_executor", + ipu_config_code::ipu_enable_model_runtime_executor}, {"ipu_custom_ops_info", ipu_config_code::ipu_custom_ops_info}, {"ipu_custom_patterns", ipu_config_code::ipu_custom_patterns}}; diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index cc37ae0695d3d..ab7d8bd368e13 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -111,5 +111,68 @@ TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) { } } +// multi threading +TEST(Analyzer_Resnet50_ipu, model_runtime_multi_thread) { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + AnalysisConfig config; + const int thread_num = 10; + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); + config.SetIpuConfig(false, 1, 1.0, false, true); + config.SetModel(model_dir + "/model", model_dir + "/params"); + + auto main_predictor = CreatePaddlePredictor(config); + std::vector> inputs; + std::vector> outputs; + std::vector predictors; + std::vector threads; + outputs.resize(thread_num); + inputs.resize(thread_num); + + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + + PaddleTensor in; + in.shape = {batch, channel, height, width}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + + for (int i = 0; i < thread_num; ++i) { + inputs[i].emplace_back(in); + predictors.emplace_back(std::move(main_predictor->Clone())); + } + + auto run = [](PaddlePredictor* predictor, + std::vector& input, + std::vector& output) { + ASSERT_TRUE(predictor->Run(input, &output)); + }; + + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back( + run, predictors[i].get(), std::ref(inputs[i]), std::ref(outputs[i])); + } + + for (int i = 0; i < thread_num; ++i) { + threads[i].join(); + } + + const size_t expected_size = 1; + for (int i = 0; i < thread_num; ++i) { + EXPECT_EQ(outputs[i].size(), expected_size); + float* data_o = static_cast(outputs[i][0].data.data()); + + for (size_t j = 0; j < outputs[i][0].data.length() / sizeof(float); + j += 10) { + EXPECT_NEAR( + (data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0., 12e-5); + } + } +} } // namespace inference } // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 2bfe221659ade..83db629dc89f2 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -678,7 +678,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, py::arg("ipu_available_memory_proportion") = 1.0, - py::arg("ipu_enable_half_partial") = false) + py::arg("ipu_enable_half_partial") = false, + py::arg("ipu_enable_model_runtime_executor") = false) 
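           // Note (sketch, not part of the patch): with the extra py::arg above, the
           // new flag should also be reachable from the Python inference API, assuming
           // the bound method name mirrors AnalysisConfig::SetIpuConfig, e.g.
           //   config.set_ipu_config(False, 1, 1.0, False, True)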
.def("set_ipu_custom_info", &AnalysisConfig::SetIpuCustomInfo, py::arg("ipu_custom_ops_info") = From e4ebf3834bef5576d63a7087ae824d1501c745f5 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Wed, 16 Nov 2022 10:19:54 +0800 Subject: [PATCH 025/210] Update `ProcessGroupCustom` for `sync_op` compatibility (#47976) * refactor: update pg custom * fix: use new api in ut * fix: typo * revert: recover legacy apis * fix: add GetDeviceContext --- .../collective/ProcessGroupCustom.cc | 170 +++++++++++------- .../collective/ProcessGroupCustom.h | 25 ++- .../custom_runtime/process_group_xccl.py | 20 +-- 3 files changed, 131 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index 61e68889190f0..2a87c78993719 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -202,38 +202,6 @@ std::shared_ptr ProcessGroupCustom::Collective( return task; } -std::shared_ptr ProcessGroupCustom::AllGather( - std::vector& in_tensors, - std::vector& out_tensors) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCustomPlace(in_tensors, device_type_), - true, - platform::errors::InvalidArgument( - "All inputs should be in CustomPlace(%s).", device_type_)); - PADDLE_ENFORCE_EQ( - CheckTensorsInCustomPlace(out_tensors, device_type_), - true, - platform::errors::InvalidArgument( - "All outputs should be in CustomPlace(%s).", device_type_)); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - phi::ccl::CCLComm comm, - const phi::stream::Stream& stream) { - return phi::DeviceManager::CCLAllGather( - device_type_, - input.data(), - output.data(), - input.numel(), - phi::ccl::ToCCLDataType(input.dtype()), - comm, - stream); - }, - CommType::ALLGATHER); -} - void* XcclGetPointerByOffset(void* raw_pointer, size_t offset, experimental::DataType type) { @@ -259,13 +227,13 @@ void* XcclGetPointerByOffset(void* raw_pointer, return nullptr; } -// NOTE: this is ONLY for compatibility std::shared_ptr ProcessGroupCustom::AllGather( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, int64_t offset, int64_t numel, - bool sync_op) { + bool sync_op // for compatibility, no use now +) { std::vector in_wrapper{in_tensor}; std::vector out_wrapper{*out_tensor}; return Collective( @@ -287,6 +255,105 @@ std::shared_ptr ProcessGroupCustom::AllGather( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupCustom::AllReduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const AllreduceOptions& opts, + bool sync_op // for compatibility, no use now +) { + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; + return AllReduce(in_wrapper, out_wrapper, opts); +} + +std::shared_ptr ProcessGroupCustom::Broadcast( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const BroadcastOptions& opts, + bool sync_op // for compatibility, no use now +) { + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; + return Broadcast(in_wrapper, out_wrapper, opts); +} + +std::shared_ptr ProcessGroupCustom::Barrier( + const BarrierOptions& opts) { + // Only support single card single process + PADDLE_ENFORCE_GE(opts.device_id, + 0, + platform::errors::PreconditionNotMet( + "The barrier device id must greater or equal than 0.")); + platform::CustomPlace place(device_type_, opts.device_id); + auto 
allocator = std::unique_ptr( + new paddle::experimental::DefaultAllocator(place)); + phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim{1}); + phi::DenseTensor barrier_tensor{allocator.get(), meta}; + + auto task = ProcessGroupCustom::AllReduce(&barrier_tensor, + barrier_tensor, + {}, + /*sync_op*/ true); + auto xccl_task = dynamic_cast(task.get()); + xccl_task->barrierTensors_ = {barrier_tensor}; + return task; +} + +const phi::DeviceContext& ProcessGroupCustom::GetDeviceContext( + const Place& place) const { + const std::string key = GetKeyFromPlace(place); + const auto& iter = places_to_ctx_.find(key); + PADDLE_ENFORCE_NE( + iter, + places_to_ctx_.end(), + platform::errors::NotFound( + "Cannot find the device context in this process group.")); + return *iter->second[0]; +} + +phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const { + std::vector places = {place}; + const auto& iter = places_to_customcomm_.find(GetKeyFromPlaces(places)); + PADDLE_ENFORCE_NE(iter, + places_to_customcomm_.end(), + platform::errors::InvalidArgument( + "Cannot find nccl comm in process group.")); + return iter->second[0]->GetCustomCCLComm(); +} + +// TODO(sunyilun): methods below will be removed later +std::shared_ptr ProcessGroupCustom::AllGather( + std::vector& in_tensors, + std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCustomPlace(in_tensors, device_type_), + true, + platform::errors::InvalidArgument( + "All inputs should be in CustomPlace(%s).", device_type_)); + PADDLE_ENFORCE_EQ( + CheckTensorsInCustomPlace(out_tensors, device_type_), + true, + platform::errors::InvalidArgument( + "All outputs should be in CustomPlace(%s).", device_type_)); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + phi::ccl::CCLComm comm, + const phi::stream::Stream& stream) { + return phi::DeviceManager::CCLAllGather( + device_type_, + input.data(), + output.data(), + input.numel(), + phi::ccl::ToCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER); +} + std::shared_ptr ProcessGroupCustom::AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT @@ -366,40 +433,5 @@ std::shared_ptr ProcessGroupCustom::Broadcast( CommType::BROADCAST); } -std::shared_ptr ProcessGroupCustom::Barrier( - const BarrierOptions& opts) { - // Only support single card single process - PADDLE_ENFORCE_GE(opts.device_id, - 0, - platform::errors::PreconditionNotMet( - "The barrier device id must greater or equal than 0.")); - platform::CustomPlace place(device_type_, opts.device_id); - std::vector places = {place}; - std::vector barrierTensors; - barrierTensors.reserve(places.size()); - - for (auto& place : places) { - phi::DeviceGuard guard(place); - phi::DenseTensorMeta meta(phi::DataType::FLOAT32, phi::DDim({1})); - auto allocator = std::unique_ptr( - new paddle::experimental::DefaultAllocator(place)); - barrierTensors.emplace_back(allocator.get(), meta); - } - auto task = ProcessGroupCustom::AllReduce(barrierTensors, barrierTensors); - auto xccl_task = dynamic_cast(task.get()); - xccl_task->barrierTensors_ = std::move(barrierTensors); - return task; -} - -phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const { - std::vector places = {place}; - const auto& iter = places_to_customcomm_.find(GetKeyFromPlaces(places)); - PADDLE_ENFORCE_NE(iter, - places_to_customcomm_.end(), - platform::errors::InvalidArgument( - "Cannot find nccl comm in process group.")); - return 
iter->second[0]->GetCustomCCLComm(); -} - } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index 3ca2d767c7fc5..050e780ae120d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -78,6 +78,26 @@ class ProcessGroupCustom : public ProcessGroup { int64_t numel, bool sync_op) override; + std::shared_ptr AllReduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const AllreduceOptions& opts, + bool sync_op) override; + + std::shared_ptr Broadcast( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const BroadcastOptions& opts, + bool sync_op) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + const phi::DeviceContext& GetDeviceContext(const Place& place) const override; + + phi::ccl::CCLComm CustomCCLComm(const Place& place) const; + + // TODO(sunyilun): methods below will be removed later std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; @@ -92,11 +112,6 @@ class ProcessGroupCustom : public ProcessGroup { std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; - std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) override; - - phi::ccl::CCLComm CustomCCLComm(const Place& place) const; - protected: virtual std::shared_ptr CreateTask( std::vector places, diff --git a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py index 1d3dfce9597a1..9734b2e775e19 100644 --- a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py +++ b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py @@ -63,11 +63,11 @@ def test_create_process_group_xccl(self): sum_result = tensor_x + tensor_y if pg.rank() == 0: - task = pg.allreduce(tensor_x) + task = pg.all_reduce(tensor_x, core.ReduceOp.SUM, sync_op=True) task.wait() # assert np.array_equal(tensor_x, sum_result) else: - task = pg.allreduce(tensor_y) + task = pg.all_reduce(tensor_y, core.ReduceOp.SUM, sync_op=True) task.wait() # assert np.array_equal(tensor_y, sum_result) @@ -81,11 +81,11 @@ def test_create_process_group_xccl(self): max_result = paddle.maximum(tensor_x, tensor_y) if pg.rank() == 0: - task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task = pg.all_reduce(tensor_x, core.ReduceOp.MAX, sync_op=True) task.wait() # assert np.array_equal(tensor_x, max_result) else: - task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task = pg.all_reduce(tensor_y, core.ReduceOp.MAX, sync_op=True) task.wait() # assert np.array_equal(tensor_y, max_result) @@ -101,14 +101,14 @@ def test_create_process_group_xccl(self): broadcast_result = paddle.assign(tensor_x) if pg.rank() == 0: - task = pg.broadcast(tensor_x, 0) - task.synchronize() + task = pg.broadcast(tensor_x, 0, sync_op=True) + task.wait() # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1) assert task.is_completed() # assert np.array_equal(broadcast_result, tensor_x) else: - task = pg.broadcast(tensor_y, 0) - task.synchronize() + task = pg.broadcast(tensor_y, 0, sync_op=True) + task.wait() # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1) assert task.is_completed() # assert np.array_equal(broadcast_result, tensor_y) @@ -139,12 +139,12 @@ def test_create_process_group_xccl(self): out = 
np.random.random(out_shape).astype(self.dtype) tensor_out = paddle.to_tensor(out) if pg.rank() == 0: - task = pg.all_gather(tensor_x, tensor_out) + task = pg.all_gather(tensor_out, tensor_x, sync_op=True) task.wait() # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1) # rank 1 else: - task = pg.all_gather(tensor_y, tensor_out) + task = pg.all_gather(tensor_out, tensor_y, sync_op=True) task.wait() # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1) out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) From 9adca1e73e13049d82f667571e3f236372c1ef31 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 11:08:10 +0800 Subject: [PATCH 026/210] move "gpu_primitives.h" to phi (#48015) --- paddle/phi/backends/gpu/gpu_primitives.h | 610 ++++++++++++++++++ .../phi/kernels/funcs/detail/gru_gpu_kernel.h | 2 +- .../kernels/funcs/detail/lstm_gpu_kernel.h | 11 +- paddle/phi/kernels/funcs/gather.cu.h | 4 +- paddle/phi/kernels/funcs/pooling.cu | 9 +- paddle/phi/kernels/funcs/scatter.cu.h | 6 +- paddle/phi/kernels/funcs/segment_pooling.cu | 20 +- .../kernels/funcs/selected_rows_functor.cu | 12 +- paddle/phi/kernels/gpu/accuracy_kernel.cu | 4 +- paddle/phi/kernels/gpu/adagrad_kernel.cu | 10 +- .../kernels/gpu/affine_grid_grad_kernel.cu | 51 +- paddle/phi/kernels/gpu/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpu/auc_kernel.cu | 8 +- paddle/phi/kernels/gpu/bincount_kernel.cu | 9 +- paddle/phi/kernels/gpu/box_coder.cu | 2 +- .../gpu/deformable_conv_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/depthwise_conv.h | 6 +- .../phi/kernels/gpu/diagonal_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/diagonal_kernel.cu | 4 +- .../gpu/distribute_fpn_proposals_kernel.cu | 4 +- .../phi/kernels/gpu/edit_distance_kernel.cu | 4 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/graph_reindex_funcs.h | 2 +- .../phi/kernels/gpu/graph_send_recv_funcs.h | 21 +- .../kernels/gpu/graph_send_ue_recv_funcs.h | 53 +- .../kernels/gpu/grid_sample_grad_kernel.cu | 6 +- .../phi/kernels/gpu/group_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/group_norm_utils.h | 4 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 8 +- .../phi/kernels/gpu/index_add_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/index_add_kernel.cu | 6 +- .../kernels/gpu/index_sample_grad_kernel.cu | 6 +- .../kernels/gpu/index_select_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/index_select_impl.h | 4 +- paddle/phi/kernels/gpu/index_select_kernel.cu | 4 +- .../kernels/gpu/interpolate_grad_kernel.cu | 160 ++--- paddle/phi/kernels/gpu/interpolate_kernel.cu | 2 +- paddle/phi/kernels/gpu/linspace_kernel.cu | 2 +- .../phi/kernels/gpu/nanmedian_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/nanmedian_kernel.cu | 12 +- paddle/phi/kernels/gpu/nll_loss.h | 6 +- paddle/phi/kernels/gpu/nms_kernel.cu | 2 +- paddle/phi/kernels/gpu/one_hot_kernel.cu | 4 +- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/pad3d_kernel.cu | 4 +- .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 4 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 15 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/roll_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel.cu | 2 +- paddle/phi/kernels/gpu/roll_kernel_impl.h | 4 +- .../kernels/gpu/send_ue_recv_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 2 +- paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/send_uv_kernel.cu | 2 +- paddle/phi/kernels/gpu/sgd_kernel.cu | 4 +- 
paddle/phi/kernels/gpu/shard_index_kernel.cu | 4 +- paddle/phi/kernels/gpu/transpose_kernel.cu | 2 +- paddle/phi/kernels/gpu/trunc_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/trunc_kernel.cu | 4 +- .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- .../impl/repeat_interleave_grad_kernel_impl.h | 6 +- .../impl/repeat_interleave_kernel_impl.h | 14 +- 64 files changed, 895 insertions(+), 328 deletions(-) create mode 100644 paddle/phi/backends/gpu/gpu_primitives.h diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h new file mode 100644 index 0000000000000..be08f29aa8150 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -0,0 +1,610 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif +#include + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +template +using complex = phi::dtype::complex; + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; + +namespace phi { + +#define CUDA_ATOMIC_WRAPPER(op, T) \ + __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) + +#define USE_CUDA_ATOMIC(op, T) \ + CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } + +// Default thread count per block(or block size). +// TODO(typhoonzero): need to benchmark against setting this value +// to 1024. +constexpr int PADDLE_CUDA_NUM_THREADS = 512; + +// For atomicAdd. +USE_CUDA_ATOMIC(Add, float); +USE_CUDA_ATOMIC(Add, int); +USE_CUDA_ATOMIC(Add, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT + +CUDA_ATOMIC_WRAPPER(Add, int64_t) { + // Here, we check long long int must be int64_t. + static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + return CudaAtomicAdd( + reinterpret_cast(address), // NOLINT + static_cast(val)); // NOLINT +} + +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +USE_CUDA_ATOMIC(Add, double); +#else +CUDA_ATOMIC_WRAPPER(Add, double) { + unsigned long long int *address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN + } while (assumed != old); + + return __longlong_as_double(old); +} +#endif + +#ifdef PADDLE_CUDA_FP16 +// NOTE(dzhwinter): cuda do not have atomicCAS for half. +// Just use the half address as a unsigned value address and +// do the atomicCAS. 
According to the value store at high 16 bits +// or low 16 bits, then do a different sum and CAS. +// Given most warp-threads will failed on the atomicCAS, so this +// implemented should be avoided in high concurrency. It's will be +// slower than the way convert value into 32bits and do a full atomicCAS. + +// convert the value into float and do the add arithmetic. +// then store the result into a uint32. +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, float16) { + // concrete packed float16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" +// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. +template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *tensor, + size_t index, + const size_t numel, + T value) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + CudaAtomicAdd(reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // whether the address is 32-byte aligned. 
+ __half *target_addr = reinterpret_cast<__half *>(tensor + index); + bool aligned_half2 = + (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (aligned_half2 && index < (numel - 1)) { + __half2 value2; + value2.x = *reinterpret_cast<__half *>(&value); + value2.y = __int2half_rz(0); + atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + + } else if (!aligned_half2 && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = *reinterpret_cast<__half *>(&value); + atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + + } else { + atomicAdd(reinterpret_cast<__half *>(tensor) + index, + *reinterpret_cast<__half *>(&value)); + } +#endif +} + +template < + typename T, + typename std::enable_if::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *arr, + size_t index, + const size_t numel, + T value) { + CudaAtomicAdd(arr + index, value); +} +#endif + +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. +inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +CUDA_ATOMIC_WRAPPER(Add, complex) { + float *real = reinterpret_cast(address); + float *imag = real + 1; + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); +} + +CUDA_ATOMIC_WRAPPER(Add, complex) { + double *real = reinterpret_cast(address); + double *imag = real + 1; + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); +} + +// For atomicMax +USE_CUDA_ATOMIC(Max, int); +USE_CUDA_ATOMIC(Max, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. 
+// It because unsigned long long int is not necessarily uint64_t +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) +USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT +#else +CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT + if (*address >= val) { + return *address; + } + + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + if (assumed >= val) { + break; + } + + old = atomicCAS(address, assumed, val); + } while (assumed != old); +} +#endif + +CUDA_ATOMIC_WRAPPER(Max, int64_t) { + // Here, we check long long int must be int64_t. + static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address; // NOLINT + while (val > res) { + long long int old = res; // NOLINT + res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val); // NOLINT + if (res == old) { + break; + } + } + return res; +} + +CUDA_ATOMIC_WRAPPER(Max, float) { + if (*address >= val) { + return *address; + } + + int *const address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_i, assumed, __float_as_int(val)); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Max, double) { + if (*address >= val) { + return *address; + } + + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + if (__longlong_as_double(assumed) >= val) { + break; + } + + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { + float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(max(static_cast(low_half), x)); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { + float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(max(static_cast(high_half), x)); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Max, float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +// For atomicMin +USE_CUDA_ATOMIC(Min, int); +USE_CUDA_ATOMIC(Min, unsigned int); +// CUDA API uses unsigned long long int, we cannot use uint64_t here. 
+
+// For atomicMin
+USE_CUDA_ATOMIC(Min, int);
+USE_CUDA_ATOMIC(Min, unsigned int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// This is because unsigned long long int is not necessarily uint64_t.
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
+USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
+#else
+CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
+  if (*address <= val) {
+    return *address;
+  }
+
+  unsigned long long int old = *address, assumed;  // NOLINT
+
+  do {
+    assumed = old;
+    if (assumed <= val) {
+      break;
+    }
+
+    old = atomicCAS(address, assumed, val);
+  } while (assumed != old);
+}
+#endif
+
+CUDA_ATOMIC_WRAPPER(Min, int64_t) {
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  long long int res = *address;  // NOLINT
+  while (val < res) {
+    long long int old = res;  // NOLINT
+    res = (long long int)atomicCAS((unsigned long long int *)address,  // NOLINT
+                                   (unsigned long long int)old,        // NOLINT
+                                   (unsigned long long int)val);       // NOLINT
+    if (res == old) {
+      break;
+    }
+  }
+  return res;
+}
+
+CUDA_ATOMIC_WRAPPER(Min, float) {
+  if (*address <= val) {
+    return *address;
+  }
+
+  int *const address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+
+  do {
+    assumed = old;
+    if (__int_as_float(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
+  } while (assumed != old);
+
+  return __int_as_float(old);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, double) {
+  if (*address <= val) {
+    return *address;
+  }
+
+  unsigned long long int *const address_as_ull =            // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
+
+  do {
+    assumed = old;
+    if (__longlong_as_double(assumed) <= val) {
+      break;
+    }
+
+    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+
+#ifdef PADDLE_CUDA_FP16
+inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) {
+  float16 low_half;
+  // The float16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
+  low_half = static_cast<float16>(min(static_cast<float>(low_half), x));
+  return (val & 0xFFFF0000u) | low_half.x;
+}
+
+inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) {
+  float16 high_half;
+  // The float16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half = static_cast<float16>(min(static_cast<float>(high_half), x));
+  return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Min, float16) {
+  if (*address <= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The float16 value stays at the lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The float16 value stays at the higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+#endif
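
The block-wide helper defined just below is what the embedding-gradient kernel later in this series calls. A sketch of the intended call pattern, not part of the patch (RowAccumulateKernel and its parameters are hypothetical; the helper itself is only compiled under PADDLE_CUDA_FP16 and PADDLE_WITH_CUDA):

// Sketch only: one thread block accumulates one length-D row of src into dst;
// for float16 the helper packs adjacent pairs into __half2 atomicAdds.
template <typename T>
__global__ void RowAccumulateKernel(const T *src, T *dst,
                                    int64_t rows, int64_t D) {
  for (int64_t row = blockIdx.x; row < rows; row += gridDim.x) {
    phi::VectorizedAtomicAddPerBlock(D, threadIdx.x, blockDim.x,
                                     src + row * D, dst + row * D);
  }
}
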
+
+#ifdef PADDLE_CUDA_FP16
+#ifdef PADDLE_WITH_CUDA
+/*
+ * One thread block deals with elementwise atomicAdd for a vector of len.
+ * @in: [x1, x2, x3, ...]
+ * @out:[y1+x1, y2+x2, y3+x3, ...]
+ * */
+template <
+    typename T,
+    typename std::enable_if<!std::is_same<float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+}
+
+// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
+template <
+    typename T,
+    typename std::enable_if<std::is_same<float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+#else
+  int i = 0;
+  int loops = len / 2 * 2;
+
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+
+  if (aligned_half2) {
+    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+      __half2 value2;
+      T value_1 = in[i];
+      T value_2 = in[i + 1];
+      value2.x = *reinterpret_cast<__half *>(&value_1);
+      value2.y = *reinterpret_cast<__half *>(&value_2);
+      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+    }
+    for (; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  } else {
+    for (int i = tid; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  }
+#endif
+}
+#endif
+#endif
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
index 93232d8f7f434..9eac6e602738e 100644
--- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/kernels/funcs/detail/activation_functions.h"
 #include "paddle/phi/kernels/funcs/gru_compute.h"
diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
index 9b5c24abc677d..a8083b1388789 100644
--- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
+++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h
@@ -15,8 +15,8 @@ limitations under the License.
*/ #pragma once #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/lstm_compute.h" @@ -202,15 +202,12 @@ __global__ void KeLstmBackward(Op op, if (is_batch) { if (value.prev_state_value) { if (grad.check_ig_grad) - paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, - r_checkIGrad); + phi::CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad); if (grad.check_fg_grad) - paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, - r_checkFGrad); + phi::CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad); } if (grad.check_og_grad) - paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, - r_checkOGrad); + phi::CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad); } else { if (value.prev_state_value) { if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad; diff --git a/paddle/phi/kernels/funcs/gather.cu.h b/paddle/phi/kernels/funcs/gather.cu.h index f660560fceb5e..ac8487db8f62e 100644 --- a/paddle/phi/kernels/funcs/gather.cu.h +++ b/paddle/phi/kernels/funcs/gather.cu.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" // TODO(paddle-dev): move gpu_primitives.h to phi -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -217,7 +217,7 @@ __global__ void GatherGradGPUKernel(const T* input, int64_t out_index = inner_dim_index * (outer_dim_size * out_index_dim_size) + index[index_dim_index] * outer_dim_size + out_dim_index; - paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx)); + phi::CudaAtomicAdd(out + out_index, *(input + idx)); } } diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 59d4d21ec4650..39cd26a455c26 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/primitive/datamover_primitives.h" @@ -428,8 +428,7 @@ __global__ void KernelMaxPool2DGrad(const int nthreads, if (maxIndex != -1) { // atomic add - paddle::platform::CudaAtomicAdd(input_grad + maxIndex, - output_grad[index]); + phi::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); } } } @@ -1330,7 +1329,7 @@ __global__ void KernelMaxPool3DGrad(const int nthreads, } if (maxIdx != -1) { // atomic add - paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + phi::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } @@ -2359,7 +2358,7 @@ __global__ void KernelMaxPool3DWithIdxGrad( w_offset; int max_index = mask[output_index]; if (max_index != -1) { - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &input_grad[nc_offset * input_depth * input_height * input_width + max_index], output_grad[output_index]); diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h index d42538edb7561..6aeb09b232bd5 100644 --- a/paddle/phi/kernels/funcs/scatter.cu.h +++ b/paddle/phi/kernels/funcs/scatter.cu.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -70,7 +70,7 @@ __global__ void ScatterCUDAKernel(const T* params, if (overwrite) { *(output + out_i) = *(params + i); } else { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + i)); + phi::CudaAtomicAdd(output + out_i, *(params + i)); } } } @@ -104,7 +104,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, temp *= output_dims[j]; } int64_t output_i = gather_i + slice_i; - paddle::platform::CudaAtomicAdd(output + output_i, *(update + i)); + phi::CudaAtomicAdd(output + output_i, *(update + i)); } } diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 0b608367bbe74..99efa783d8e77 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/segment_pooling.h" @@ -60,7 +60,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, } if (j > 0) { if (last_segment_id == first_segment_id) { - paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + phi::CudaAtomicAdd(summed_ids + last_segment_id, sum); } else { *(summed_ids + last_segment_id) = sum; } @@ -70,7 +70,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids, sum += T(1); last_segment_id = current_segment_id; } - paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + phi::CudaAtomicAdd(summed_ids + last_segment_id, sum); } } @@ -111,8 +111,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, last_segment_id * inner_dim_size + segment_offset; if (last_segment_id == first_segment_id) { - paddle::platform::CudaAtomicAdd( - output + output_index, sum / *(summed_ids + last_segment_id)); + phi::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); } else { *(output + output_index) = sum / *(summed_ids + last_segment_id); } @@ -123,8 +123,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids, last_segment_id = current_segment_id; } Index output_index = last_segment_id * inner_dim_size + segment_offset; - paddle::platform::CudaAtomicAdd(output + output_index, - sum / *(summed_ids + last_segment_id)); + phi::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); } } @@ -215,7 +215,7 @@ class MaxPool { DEVICE inline T initial() { return static_cast(-FLT_MAX); } DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicMax(address, val); + return phi::CudaAtomicMax(address, val); } }; @@ -225,7 +225,7 @@ class MinPool { DEVICE inline T initial() { return static_cast(FLT_MAX); } DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicMin(address, val); + return phi::CudaAtomicMin(address, val); } }; @@ -235,7 +235,7 @@ class SumPool { DEVICE inline T initial() { return static_cast(0); } DEVICE inline void compute(const T& x, T* y) { *y = *y + x; } DEVICE inline T atomic(T* address, const T val) { - return paddle::platform::CudaAtomicAdd(address, val); + return phi::CudaAtomicAdd(address, val); } }; diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu index c21402693b9c8..e08fea2b35317 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cu +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -127,7 +127,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, // Since index in rows of SelectedRows can be duplicate, we can not use // tensor_out[index] += selected_rows[index]; Instead, we have to use // AtomicAdd to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]); } } } // namespace @@ -279,7 +279,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, for (int index = tid; index < row_numel; index += block_size) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]); + phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]); } } } // namespace @@ -360,7 +360,7 @@ __global__ void MergeAddKernel(const T* input, input += ty * row_numel; out += out_idx * row_numel; for (int index = tid; index < row_numel; index += block_size) { - paddle::platform::CudaAtomicAdd(out + index, input[index]); + phi::CudaAtomicAdd(out + index, input[index]); } } @@ -623,9 +623,9 @@ struct UpdateToTensor { auto* in1_data = in1_value.template data(); auto* in2_data = input2->data(); - dim3 threads(paddle::platform::PADDLE_CUDA_NUM_THREADS, 1); + dim3 threads(phi::PADDLE_CUDA_NUM_THREADS, 1); dim3 grid(in1_rows.size(), 1); - UpdateToTensorKernel + UpdateToTensorKernel <<>>( in1_data, in1_rows.cuda_data(), op, in2_data, in1_row_numel); } diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index 771b189f04071..ef3e5b9af2408 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -17,14 +17,14 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AccuracyCudaKernel(const int N, diff --git a/paddle/phi/kernels/gpu/adagrad_kernel.cu b/paddle/phi/kernels/gpu/adagrad_kernel.cu index 150b18bdbd600..53169c8bcfb60 100644 --- a/paddle/phi/kernels/gpu/adagrad_kernel.cu +++ b/paddle/phi/kernels/gpu/adagrad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/adagrad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" @@ -47,7 +47,7 @@ __global__ void MergeGradKernel(const T* grad, grad += ty * row_numel; grad_merge += grad_merge_idx * row_numel; for (int index = tid; index < row_numel; index += block_size) { - paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + phi::CudaAtomicAdd(grad_merge + index, grad[index]); } } @@ -69,9 +69,9 @@ __global__ void 
SparseAdagradFunctorKernel(const T* grad, for (int index = tid; index < row_numel; index += block_size) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd(param + index, - -1.0 * learning_rate[0] * grad[index] / - (sqrt(moment[index]) + epsilon)); + phi::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); } } diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index 5cfa8cf30676c..a7a82236a40a2 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/affine_grid_utils.h" @@ -75,18 +75,14 @@ __global__ void affine_grid_grad_kernel_4d(const int count, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset, - out_grad_x * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1, - out_grad_x * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); + phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3, - out_grad_y * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4, - out_grad_y * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); + phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } @@ -116,31 +112,22 @@ __global__ void affine_grid_grad_kernel_5d(const int count, int theta_offset = n * 12; // 3 * 4; T out_grad_x = out_grad[index * 3]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset, - out_grad_x * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 1, - out_grad_x * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 2, - out_grad_x * d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x); + phi::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_x); T out_grad_y = out_grad[index * 3 + 1]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 4, - out_grad_y * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 5, - out_grad_y * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 6, - out_grad_y * d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y); + 
phi::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 6, out_grad_y * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 7, out_grad_y); T out_grad_z = out_grad[index * 3 + 2]; - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 8, - out_grad_z * w_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 9, - out_grad_z * h_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 10, - out_grad_z * d_coor); - paddle::platform::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z); + phi::CudaAtomicAdd(theta_grad + theta_offset + 8, out_grad_z * w_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 9, out_grad_z * h_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 10, out_grad_z * d_coor); + phi::CudaAtomicAdd(theta_grad + theta_offset + 11, out_grad_z); } } diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index 0f42960502b54..499ed260eef47 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/affine_grid_utils.h" diff --git a/paddle/phi/kernels/gpu/auc_kernel.cu b/paddle/phi/kernels/gpu/auc_kernel.cu index 44c0df3b52213..c815f33a6675f 100644 --- a/paddle/phi/kernels/gpu/auc_kernel.cu +++ b/paddle/phi/kernels/gpu/auc_kernel.cu @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/auc_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, @@ -74,9 +74,9 @@ __global__ void AddDataKernel(const int64_t *label_data, "The predict data must gather or equal 0."); uint32_t binIdx = static_cast(predict_data * num_thresholds); if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); + phi::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); + phi::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); } } } diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index 3b1e41d92e6b6..162df476982ee 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -14,15 +14,15 @@ #include "paddle/phi/kernels/bincount_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using 
phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -36,12 +36,11 @@ __global__ void KernelBincount(const InputT* input, OutT* output) { if (!has_weights) { for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + phi::CudaAtomicAdd(&output[input[i]], 1L); } } else { for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); + phi::CudaAtomicAdd(&output[input[i]], static_cast(weights[i])); } } } diff --git a/paddle/phi/kernels/gpu/box_coder.cu b/paddle/phi/kernels/gpu/box_coder.cu index bca18c25c3eee..6dd53c04d29fb 100644 --- a/paddle/phi/kernels/gpu/box_coder.cu +++ b/paddle/phi/kernels/gpu/box_coder.cu @@ -18,8 +18,8 @@ #include #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/box_coder.h" diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu index b46f1f4a3314d..55c8a9f96fd81 100644 --- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/deformable_conv_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h" @@ -107,8 +107,8 @@ __global__ void ModulatedDeformableCol2imGpuKernel( height, width); - paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, - weight * cur_top_grad); + phi::CudaAtomicAdd(grad_im + cur_bottom_grad_pos, + weight * cur_top_grad); } } } diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 21d0faf0b0d7b..5da0ae96e6be4 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -28,7 +28,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -981,7 +981,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( } #undef gaid } - platform::CudaAtomicAdd(&filter_grad_data[gbid], s); + phi::CudaAtomicAdd(&filter_grad_data[gbid], s); } } @@ -1057,7 +1057,7 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( } for (int i = 0; i < c_filter * c_filter; ++i) { T* weight = filter_grad_data + i * output_channels + kernel_id; - platform::CudaAtomicAdd(&weight[0], r_weight[i]); + phi::CudaAtomicAdd(&weight[0], r_weight[i]); } } } diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 1fd1e446991fa..05a57426fcb21 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -15,13 +15,13 @@ #include "paddle/phi/kernels/diagonal_grad_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -#include 
"paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index 169cb3f2c78b9..74bad0ecd9a35 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -15,12 +15,12 @@ #include "paddle/phi/kernels/diagonal_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/diagonal.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void DiagonalKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu index 130dc99ab1734..bcce09649a8fc 100644 --- a/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu +++ b/paddle/phi/kernels/gpu/distribute_fpn_proposals_kernel.cu @@ -32,7 +32,7 @@ namespace cub = hipcub; #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace phi { @@ -69,7 +69,7 @@ __global__ void GPUDistFpnProposalsHelper(const int nthreads, tgt_lvl = min(max_level, max(tgt_lvl, min_level)); target_lvls[i] = tgt_lvl; // compute number of rois in the same batch and same target level - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1); } } diff --git a/paddle/phi/kernels/gpu/edit_distance_kernel.cu b/paddle/phi/kernels/gpu/edit_distance_kernel.cu index 993b4771cc958..d63430d527992 100644 --- a/paddle/phi/kernels/gpu/edit_distance_kernel.cu +++ b/paddle/phi/kernels/gpu/edit_distance_kernel.cu @@ -18,14 +18,14 @@ #include #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillFirstRow(T* dist, const int N) { diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index e10d01ce9e4a5..0cfe2e43d1875 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -16,8 +16,8 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -51,10 +51,10 @@ __global__ void EmbeddingGrad(T* table, 
const T* out = output + idy * D; T* tab = table + id * D; #ifdef PADDLE_WITH_CUDA - paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); + phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); #else for (int i = idx; i < D; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } #endif idy += blockDim.y * gridDim.x; diff --git a/paddle/phi/kernels/gpu/graph_reindex_funcs.h b/paddle/phi/kernels/gpu/graph_reindex_funcs.h index 0a6d6a549a730..aee6e5c4d46ce 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_funcs.h +++ b/paddle/phi/kernels/gpu/graph_reindex_funcs.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/graph_reindex_kernel.h" diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 180a9dfac854b..9aacba8a7a3aa 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -19,8 +19,8 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/send_u_recv_kernel.h" @@ -32,7 +32,7 @@ struct GraphSendRecvSumCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + phi::CudaAtomicAdd(output + out_i, *(params + in_i)); } }; @@ -42,7 +42,7 @@ struct GraphSendRecvMaxCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + phi::CudaAtomicMax(output + out_i, *(params + in_i)); } }; @@ -52,7 +52,7 @@ struct GraphSendRecvMinCUDAFunctor { T* output, const IndexT& in_i, const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + phi::CudaAtomicMin(output + out_i, *(params + in_i)); } }; @@ -106,7 +106,7 @@ __global__ void ComputeCountCUDAKernel(int32_t* count, size_t index_size) { CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); + phi::CudaAtomicAdd(count + dst_i, 1); } } @@ -140,8 +140,8 @@ __global__ void ManipulateMeanGradCUDAKernel(const T* params, IndexT dst_i = dst_indices[indices_i]; int64_t in_i = src_i * slice_size + slice_i; int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, *(params + in_i) / static_cast(dst_count[src_i])); + phi::CudaAtomicAdd(output + out_i, + *(params + in_i) / static_cast(dst_count[src_i])); } } @@ -162,10 +162,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel(const T* params, IndexT dst_i = dst_indices[indices_i]; int64_t in_i = src_i * slice_size + slice_i; int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * - static_cast(*(ptr_input + out_i) == *(ptr_output + in_i))); + phi::CudaAtomicAdd(output + out_i, + *(params + in_i) * static_cast(*(ptr_input + out_i) == + *(ptr_output + in_i))); } } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 1bc841a6d8ba4..bff91078865d9 100644 --- 
a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -17,8 +17,8 @@ #include #include -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/impl/graph_message_passing_impl.h" @@ -29,25 +29,25 @@ namespace phi { #define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF inline void CopyBCastOff(const BroadCastInfo& bcast_info, - thrust::device_vector& l_bcastoff, - thrust::device_vector& r_bcastoff) { - l_bcastoff.resize(bcast_info.out_len); - r_bcastoff.resize(bcast_info.out_len); + thrust::device_vector* l_bcastoff, + thrust::device_vector* r_bcastoff) { + l_bcastoff->resize(bcast_info.out_len); + r_bcastoff->resize(bcast_info.out_len); #ifdef PADDLE_WITH_HIP - hipMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + hipMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); - hipMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + hipMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, hipMemcpyHostToDevice); #else - cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff->data()), bcast_info.l_offset.data(), sizeof(int64_t) * bcast_info.out_len, cudaMemcpyHostToDevice); - cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff->data()), bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, cudaMemcpyHostToDevice); @@ -102,21 +102,21 @@ inline int FindNumBlocks(char axis, int nblocks, int max_num_blocks = -1) { template struct GraphSendUERecvSumCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicAdd(output, val); + phi::CudaAtomicAdd(output, val); } }; template struct GraphSendUERecvMaxCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicMax(output, val); + phi::CudaAtomicMax(output, val); } }; template struct GraphSendUERecvMinCUDAFunctor { DEVICE inline void operator()(T* output, T val) { - paddle::platform::CudaAtomicMin(output, val); + phi::CudaAtomicMin(output, val); } }; @@ -192,8 +192,7 @@ __global__ void ManipulateMeanGradCUDAKernelForMulX(const T* out_grad_data, int64_t o_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; T val = out_grad_off[o_add] * e_off[e_add]; - paddle::platform::CudaAtomicAdd(x_grad_off + tx, - val / static_cast(dst_count[src])); + phi::CudaAtomicAdd(x_grad_off + tx, val / static_cast(dst_count[src])); tx += stride_x; } ty += stride_y; @@ -222,7 +221,7 @@ __global__ void ManipulateSumGradCUDAKernelForAddE(const T* out_grad_data, const T* out_grad_off = out_grad_data + dst * out_len; while (tx < out_len) { int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx]); + phi::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx]); tx += stride_x; } ty += stride_y; @@ -258,8 +257,7 @@ __global__ void ManipulateSumGradCUDAKernelForMulE(const T* x_data, while (tx < out_len) { int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? 
r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, - out_grad_off[tx] * x_off[x_add]); + phi::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx] * x_off[x_add]); tx += stride_x; } ty += stride_y; @@ -289,9 +287,8 @@ __global__ void ManipulateMeanGradCUDAKernelForAddE(const T* out_grad_data, const T* out_grad_off = out_grad_data + dst * out_len; while (tx < out_len) { int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd( - e_grad_off + e_add, - out_grad_off[tx] / static_cast(dst_count[dst])); + phi::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] / static_cast(dst_count[dst])); tx += stride_x; } ty += stride_y; @@ -328,7 +325,7 @@ __global__ void ManipulateMeanGradCUDAKernelForMulE(const T* x_data, while (tx < out_len) { int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( e_grad_off + e_add, out_grad_off[tx] * x_off[x_add] / static_cast(dst_count[dst])); tx += stride_x; @@ -373,12 +370,10 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? ebcast_off[tx] : tx; T val = x_off[x_add] + e_off[e_add]; - paddle::platform::CudaAtomicAdd( - x_grad_off + x_add, - out_grad_off[tx] * static_cast(val == out_off[tx])); - paddle::platform::CudaAtomicAdd( - e_grad_off + e_add, - out_grad_off[tx] * static_cast(val == out_off[tx])); + phi::CudaAtomicAdd(x_grad_off + x_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); + phi::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); tx += stride_x; } ty += stride_y; @@ -421,10 +416,10 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? 
ebcast_off[tx] : tx; T val = x_off[x_add] * e_off[e_add]; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( x_grad_off + x_add, out_grad_off[tx] * static_cast(val == out_off[tx]) * e_off[e_add]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( e_grad_off + e_add, out_grad_off[tx] * static_cast(val == out_off[tx]) * x_off[x_add]); tx += stride_x; diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu index 436faeaabf953..8f4beaa26775f 100644 --- a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/grid_sample_grad_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/grid_sample_utils.h" @@ -28,7 +28,7 @@ template static __forceinline__ __device__ void AtomicAdd( T* data, int h, int w, int sH, int sW, int H, int W, T delta) { if (InBounds(h, w, H, W)) { - paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); } } @@ -45,7 +45,7 @@ static __forceinline__ __device__ void AtomicAdd3D(T* data, int W, T delta) { if (InBounds3D(d, h, w, D, H, W)) { - paddle::platform::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); } } diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index cbe8c18cbffd7..54006cd8d5a18 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -71,14 +71,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - paddle::platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - paddle::platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h index 6af7b96ca2182..00986817c61a0 100644 --- a/paddle/phi/kernels/gpu/group_norm_utils.h +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -23,7 +23,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace phi { @@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { typedef cub::WarpReduce WarpReduce; typename WarpReduce::TempStorage temp_storage; value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) paddle::platform::CudaAtomicAdd(sum, value); + if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); } template diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 
4cc6bc35578d1..02f5bbb530a6c 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/histogram_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -25,7 +25,7 @@ namespace phi { using IndexType = int64_t; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -61,13 +61,13 @@ __global__ void KernelHistogram(const T* input, if (input_value >= min_value && input_value <= max_value) { const IndexType output_index = GetBin(input_value, min_value, max_value, nbins); - paddle::platform::CudaAtomicAdd(&buf_hist[output_index], 1); + phi::CudaAtomicAdd(&buf_hist[output_index], 1); } } __syncthreads(); for (int i = threadIdx.x; i < nbins; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[i], buf_hist[i]); + phi::CudaAtomicAdd(&output[i], buf_hist[i]); } } diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index ddc8a65ad51ed..88e42a16dba36 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_add_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -24,7 +24,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void IndexAddGradKernel(const Context& ctx, diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 047e54b99aa3b..215b28085a92d 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_add_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" @@ -24,7 +24,7 @@ DECLARE_bool(cudnn_deterministic); namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_add_cuda_kernel(const T* input, @@ -41,7 +41,7 @@ __global__ void index_add_cuda_kernel(const T* input, IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&output[input_idx], add_value[idx]); + phi::CudaAtomicAdd(&output[input_idx], add_value[idx]); } } diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index 5368d98c56a95..6c94e14492bc0 100644 --- 
a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -18,9 +18,9 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -50,8 +50,8 @@ __global__ void IndexSampleGrad(const IndexT* index, unsigned int in_idx = index_j * input_length + index_i; IndexT sample_idx = index[index_idx]; if (same_data_in_row) { - paddle::platform::CudaAtomicAdd( - &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + phi::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), + out_grad[sample_idx]); } else { in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; } diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index fb9157db557e6..1cd998810ee3c 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -25,7 +25,7 @@ DECLARE_bool(cudnn_deterministic); namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_grad_cuda_kernel(const T* output_grad, @@ -42,7 +42,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); + phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); } } diff --git a/paddle/phi/kernels/gpu/index_select_impl.h b/paddle/phi/kernels/gpu/index_select_impl.h index da9cdbf52783b..deeb6e5eb20f2 100644 --- a/paddle/phi/kernels/gpu/index_select_impl.h +++ b/paddle/phi/kernels/gpu/index_select_impl.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_cuda_kernel(const T* input, diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu index 135ae52651897..925feee4ccf66 100644 --- a/paddle/phi/kernels/gpu/index_select_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -14,16 +14,16 @@ #include "paddle/phi/kernels/index_select_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include 
"paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/gpu/index_select_impl.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void IndexSelectKernel(const Context& ctx, diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index ee24d9f900509..51a5f50560eac 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/interpolate_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -96,12 +96,11 @@ __global__ void KeLinearInterpBw(T* in, const T* out_pos = &out[out_id_w]; if (data_layout == DataLayout::kNCHW) { - paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); } else { - paddle::platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]); } } } @@ -141,7 +140,7 @@ __global__ void KeNearestNeighborInterpNCHWBw(T* in, while (nc_id < nc) { T* in_pos = &in[in_index]; const T out_pos = out[out_index]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); in_index += in_index_stride; out_index += out_index_stride; nc_id += nc_stride; @@ -194,7 +193,7 @@ __global__ void KeNearestNeighborInterpBw( in_img_idx * num_channels + channel_id]; const T out_pos = out[tid]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } @@ -218,7 +217,7 @@ __inline__ __device__ T PartialBlockMin(T val, } } else { shared_last_val = std::numeric_limits::max(); - paddle::platform::CudaAtomicMin(&shared_last_val, val); + phi::CudaAtomicMin(&shared_last_val, val); shared[wid] = shared_last_val; shared_last_idx = wid; } @@ -308,33 +307,27 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, ? 
(in_top_max_index - in_top_min_index) : (in_bot_max_index - in_bot_min_index); if (h_id != 0) { - paddle::platform::CudaAtomicAdd( - &s_data[0][input_index - in_top_min_index], - h2lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[0][top_right_index - in_top_min_index], - h2lambda * w1lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_left_index - in_bot_min_index], - h1lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_right_index - in_bot_min_index], - h1lambda * w1lambda * value); + phi::CudaAtomicAdd(&s_data[0][input_index - in_top_min_index], + h2lambda * w2lambda * value); + phi::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + h2lambda * w1lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + h1lambda * w2lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_right_index - in_bot_min_index], + h1lambda * w1lambda * value); } else { - paddle::platform::CudaAtomicAdd( - &s_data[0][top_right_index - in_top_min_index], - (h2lambda + h1lambda) * w1lambda * value); - paddle::platform::CudaAtomicAdd( - &s_data[1][bot_left_index - in_bot_min_index], - (h1lambda + h2lambda) * w2lambda * value); + phi::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + (h2lambda + h1lambda) * w1lambda * value); + phi::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + (h1lambda + h2lambda) * w2lambda * value); } __syncthreads(); if (threadIdx.x <= upper_limit_share_idx) { - paddle::platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], - s_data[0][threadIdx.x]); - paddle::platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], - s_data[1][threadIdx.x]); + phi::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], + s_data[0][threadIdx.x]); + phi::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], + s_data[1][threadIdx.x]); } } } @@ -387,17 +380,14 @@ __global__ void KeBilinearInterpNCHWBw(T* in, T d2val = out[index]; - paddle::platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), - h0lambda * w0lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), - h0lambda * w1lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), - h1lambda * w0lambda * d2val); - paddle::platform::CudaAtomicAdd( - in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), - h1lambda * w1lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + phi::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); } } @@ -446,12 +436,12 @@ __global__ void KeBilinearInterpBw(T* in, T value = out[tid]; T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + in_img_idx * num_channels + channel_id]; - paddle::platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - paddle::platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + 
phi::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + phi::CudaAtomicAdd( &in_pos[h_id * in_w * num_channels + w_id * num_channels], h1lambda * w1lambda * value); } @@ -530,8 +520,8 @@ __global__ void KeBicubicInterpBw(T* in, in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + access_x * num_channels + channel_id]; } - paddle::platform::CudaAtomicAdd( - &in_pos[0], (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + phi::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); } } } @@ -629,26 +619,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - paddle::platform::CudaAtomicAdd( - &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[w_id], d2lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[w_id], d1lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); } else { int in_pos1_idx = out_id_h * input_w + in_img_idt * in_img_h * in_img_w * num_channels + @@ -661,26 +647,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - paddle::platform::CudaAtomicAdd( - &in_pos1[0], d2lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[w_id * num_channels], - d2lambda * h2lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], d2lambda * h1lambda * w1lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[0], d1lambda * h2lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( - &in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - 
paddle::platform::CudaAtomicAdd( - &in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], d1lambda * h1lambda * w1lambda * out_pos[0]); } @@ -751,7 +733,7 @@ __global__ void KeNearestNeighbor3DInterpBw(T* in, in_img_idx * num_channels + channel_id]; } const T out_pos = out[out_id_h * output_w + out_id_w]; - paddle::platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index c8f6a40104b1b..625718e8f4bc9 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/interpolate_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu index 9db11381cbc15..eaf2955010cc1 100644 --- a/paddle/phi/kernels/gpu/linspace_kernel.cu +++ b/paddle/phi/kernels/gpu/linspace_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/linspace_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu index 30fd93533ed54..99b1c1a8c0af8 100644 --- a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/nanmedian_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -24,7 +24,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index 132b9fa10b7c6..2765f96321c02 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/nanmedian_kernel.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include 
"paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h" @@ -25,7 +25,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -56,15 +56,15 @@ __global__ void KernelNanCounts(const T* input, const T x = input[index]; if (isnan(static_cast(x))) { auto bin = static_cast(index / stride); - paddle::platform::CudaAtomicAdd(&buf[bin], 1); + phi::CudaAtomicAdd(&buf[bin], 1); } } __syncthreads(); for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&nan_counts[i], buf[i]); - paddle::platform::CudaAtomicAdd(&nan_total[0], buf[i]); - paddle::platform::CudaAtomicMax(&nan_total[1], stride - buf[i]); + phi::CudaAtomicAdd(&nan_counts[i], buf[i]); + phi::CudaAtomicAdd(&nan_total[0], buf[i]); + phi::CudaAtomicMax(&nan_total[1], stride - buf[i]); } } diff --git a/paddle/phi/kernels/gpu/nll_loss.h b/paddle/phi/kernels/gpu/nll_loss.h index bb47a2f06f4c3..37a67b4767a9b 100644 --- a/paddle/phi/kernels/gpu/nll_loss.h +++ b/paddle/phi/kernels/gpu/nll_loss.h @@ -20,7 +20,7 @@ #include #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" @@ -270,8 +270,8 @@ __global__ void GPUNLLLossForward2D_with_reduce(T* out_data, partial_sums, blockDim.x, acc_weight, thrust::plus(), (T)0); if (threadIdx.x == 0) { - paddle::platform::CudaAtomicAdd(total_weight_data, acc_weight); - paddle::platform::CudaAtomicAdd(out_data, input_sum); + phi::CudaAtomicAdd(total_weight_data, acc_weight); + phi::CudaAtomicAdd(out_data, input_sum); } } diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index dcc6d6e2b45f0..79b0b8dfb1825 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -16,8 +16,8 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index abe7757df7205..d055e5ad73ee9 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -15,14 +15,14 @@ #include "paddle/phi/kernels/one_hot_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillOutputKernel(const InT* p_in_data, diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index fb7f1a2325790..ca26d9be4f908 100644 --- 
a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -15,14 +15,14 @@ #include "paddle/phi/kernels/pad3d_grad_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad3DGradConstNCDHW(const int in_size, @@ -133,7 +133,7 @@ __global__ void Pad3DGradReflectNCDHW(const int out_size, in_h = min(in_h, 2 * in_height - in_h - 2); in_w = min(in_w, 2 * in_width - in_w - 2); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -176,7 +176,7 @@ __global__ void Pad3DGradReflectNDHWC(const int out_size, in_d = min(in_d, in_depth * 2 - in_d - 2); in_h = min(in_h, in_height * 2 - in_h - 2); in_w = min(in_w, in_width * 2 - in_w - 2); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], @@ -211,7 +211,7 @@ __global__ void Pad3DGradReplicateNCDHW(const int out_size, const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -247,7 +247,7 @@ __global__ void Pad3DGradReplicateNDHWC(const int out_size, const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], @@ -282,7 +282,7 @@ __global__ void Pad3DGradCircularNCDHW(const int out_size, int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[nc * in_depth * in_height * in_width + in_d * in_height * in_width + in_h * in_width + in_w], d_out_data[out_index]); @@ -318,7 +318,7 @@ __global__ void Pad3DGradCircularNDHWC(const int out_size, int in_h = ((out_h - pad_top) % in_height + in_height) % in_height; int in_w = ((out_w - pad_left) % in_width + in_width) % in_width; - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[n * in_depth * in_height * in_width * channels + in_d * in_height * in_width * channels + in_h * in_width * channels + in_w * channels + c], diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index fa85c650bc854..241ffefe5d18d 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -17,14 +17,14 @@ #include #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/complex.h" 
#include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad3DConstNCDHW(const int nthreads, diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 6ecaaef1870a1..433bb5e4cf1ad 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -16,7 +16,7 @@ #include #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -97,7 +97,7 @@ __global__ void GPUPSROIPoolBackward(const int nthreads, for (int ih = hstart; ih < hend; ++ih) { for (int iw = wstart; iw < wend; ++iw) { int input_index = ih * width + iw; - paddle::platform::CudaAtomicAdd(offset_dx_data + input_index, diff_val); + phi::CudaAtomicAdd(offset_dx_data + input_index, diff_val); } } } diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index 0673eda8d5fad..09dce366bea91 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/roi_align_grad_kernel.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -153,14 +153,11 @@ __global__ void GPURoiAlignBackward(const int nthreads, T diff3 = out_grad_this_bin * w3 / count; T diff4 = out_grad_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_low * width + x_low, diff1); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_low * width + x_high, diff2); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_high * width + x_low, diff3); - paddle::platform::CudaAtomicAdd( - offset_input_grad + y_high * width + x_high, diff4); + phi::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, diff1); + phi::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, diff2); + phi::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, diff3); + phi::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, + diff4); } } } diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index 6b888b200e1eb..527060ae68162 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -15,9 +15,9 @@ #include "paddle/phi/kernels/roi_pool_grad_kernel.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -63,7 +63,7 @@ __global__ void GPURoiPoolBackward(const int nthreads, int arg_max = offset_arg_max_data[ph * 
pooled_width + pw]; if (arg_max != -1) { - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( offset_input_grad + arg_max, static_cast(offset_output_grad[ph * pooled_width + pw])); } diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 82e0fa72ab076..ff3ac29117e41 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -20,7 +20,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void RollGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 5d3584e4f44c1..60db02d846317 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -21,7 +21,7 @@ namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template void RollKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index 823164f3fbc52..99ef7515e2c99 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void RollCudaKernel(const T* input, diff --git a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu index 281bf6278afa4..14ec4462fc5a6 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu @@ -45,7 +45,7 @@ void CalculateXEGradForMinMax(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; @@ -177,7 +177,7 @@ void CalculateXGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); @@ -300,7 +300,7 @@ void CalculateXGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); @@ -386,7 +386,7 @@ void CalculateEGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); diff --git 
a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu index 482077b7f93bf..aaae915f9df3e 100644 --- a/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_ue_recv_kernel.cu @@ -89,7 +89,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::device_vector x_bcastoff, e_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, x_bcastoff, e_bcastoff); + CopyBCastOff(bcast_info, &x_bcastoff, &e_bcastoff); } int64_t out_len = bcast_info.out_len; diff --git a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu index ad904c8ae2d88..e33d259bd44c8 100644 --- a/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_grad_kernel.cu @@ -45,7 +45,7 @@ __global__ void GraphSendUVGradCUDAKernel(const T* out_grad, const T* out_grad_off = out_grad + ty * slice_size; T* x_grad_off = x_grad + dst * slice_size; while (tx < slice_size) { - paddle::platform::CudaAtomicAdd(x_grad_off + tx, out_grad_off[tx]); + phi::CudaAtomicAdd(x_grad_off + tx, out_grad_off[tx]); tx += stride_x; } ty += stride_y; @@ -127,7 +127,7 @@ void CalculateGrad(const Context& ctx, const auto& bcast_info = phi::CalcBCastInfo(y.dims(), out_grad_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + CopyBCastOff(bcast_info, &l_bcastoff, &r_bcastoff); } int64_t out_len = bcast_info.out_len; const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); diff --git a/paddle/phi/kernels/gpu/send_uv_kernel.cu b/paddle/phi/kernels/gpu/send_uv_kernel.cu index 69c1515e8124d..860a900dac834 100644 --- a/paddle/phi/kernels/gpu/send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/send_uv_kernel.cu @@ -94,7 +94,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, thrust::device_vector x_bcastoff, y_bcastoff; if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, x_bcastoff, y_bcastoff); + CopyBCastOff(bcast_info, &x_bcastoff, &y_bcastoff); } int64_t out_len = bcast_info.out_len; diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index b2b73b2812c60..ea257ebd1cc24 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -16,9 +16,9 @@ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. 
- paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( tensor_out_ptr + index, -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu index 96fd3911c0d45..f8e51eb98d799 100644 --- a/paddle/phi/kernels/gpu/shard_index_kernel.cu +++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/shard_index_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void ShardIndexInner(const T* in_data, diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 9b895adb0a3be..a970902d80094 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index 927978339181d..8a88383e6e4f0 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -14,14 +14,14 @@ #include "paddle/phi/kernels/trunc_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void TruncGrad(T* dx, int64_t N) { diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index 1f374714a6ff1..dfc4f6589e9cf 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -14,14 +14,14 @@ #include "paddle/phi/kernels/trunc_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template class TruncFunctor { diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 13c93c9d3fef5..4bc8c205025e3 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include 
"paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu index 7a1cb6d53af13..98f200480d44c 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu @@ -18,9 +18,9 @@ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h index ddaaebafbc9eb..5e90028527333 100644 --- a/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/repeat_interleave_grad_kernel_impl.h @@ -18,7 +18,7 @@ #include "paddle/phi/kernels/cpu/index_select_impl.h" #include "paddle/phi/kernels/repeat_interleave_grad_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -33,7 +33,7 @@ namespace cub = hipcub; namespace phi { #if defined(__NVCC__) || defined(__HIPCC__) -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_grad_cuda_kernel(const T* output_grad, @@ -53,7 +53,7 @@ __global__ void index_select_grad_cuda_kernel(const T* output_grad, int64_t dim_idx = idx % (stride * size) / stride; IndexT src_dim_idx = index[dim_idx]; int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); + phi::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); } template diff --git a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h index dd950a14f67a4..8548785aaca4a 100644 --- a/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h +++ b/paddle/phi/kernels/impl/repeat_interleave_kernel_impl.h @@ -18,9 +18,9 @@ #include "paddle/phi/kernels/cpu/index_select_impl.h" #include "paddle/phi/kernels/repeat_interleave_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #endif @@ -30,7 +30,7 @@ namespace phi { #if defined(__NVCC__) || defined(__HIPCC__) -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void index_select_cuda_kernel(const T* input, T* output, @@ -81,9 +81,8 @@ void RepeatInterleaveKernel(const Context& ctx, output_dim[dim] = index_size; out->Resize(phi::make_ddim(output_dim)); 
phi::IndexSelectInner(ctx, &x_copy, index, out, dim); - } #if defined(__NVCC__) || defined(__HIPCC__) - else { + } else { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; paddle::framework::TensorFromVector(index_vec, ctx, &index); @@ -105,6 +104,8 @@ void RepeatInterleaveKernel(const Context& ctx, stream>>>( x.data(), out_data, index_data, numel, stride, size, delta); } +#else + } #endif } @@ -163,9 +164,8 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx, out->Resize(phi::make_ddim(output_dim)); IndexSelectInner(ctx, &x_copy, index, out, dim); } - } #if defined(__NVCC__) || defined(__HIPCC__) - else { + } else { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; auto stream = ctx.stream(); @@ -209,6 +209,8 @@ void RepeatInterleaveWithTensorIndexKernel(const Context& ctx, in_data, out_data, index_data, numel, stride, size, delta); } } +#else + } #endif } From 0d507fc25704b3991d21459553790bba45d80460 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Wed, 16 Nov 2022 11:17:11 +0800 Subject: [PATCH 027/210] fix xccl (#48018) --- python/paddle/distributed/parallel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 2d34ddfef7101..ca557dc7dd372 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -176,6 +176,7 @@ def train(): or core.is_compiled_with_xpu() or core.is_compiled_with_npu() or core.is_compiled_with_mlu() + or backend == "xccl" ): raise NotImplementedError( "If you want to use CPU-only version, please use 'gloo' as backend" From 2f8901cb9c0e6453916c66d800acdf81e27d74a4 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 16 Nov 2022 11:17:32 +0800 Subject: [PATCH 028/210] increase the level of some log (#47990) --- .../fluid/eager/auto_code_generator/eager_generator.cc | 2 +- .../auto_code_generator/generator/python_c_gen.py | 4 ++-- paddle/fluid/eager/to_static/run_program_op_func.h | 4 ++-- paddle/fluid/framework/attribute_checker.h | 2 +- paddle/fluid/imperative/tracer.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 10 +++++----- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 6 +++--- paddle/phi/infermeta/binary.cc | 4 ++-- paddle/phi/infermeta/unary.cc | 2 +- 11 files changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 2f51294f15b5d..8485183f7aeb6 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -592,7 +592,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) { } // Only handle matmul_v2 for now - VLOG(1) << "------ Analyzing Op ------: " << op_type; + VLOG(3) << "------ Analyzing Op ------: " << op_type; return true; } diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 10eed267bc98b..8e3944b79c30f 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -115,7 +115,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} if (paddle::platform::is_gpu_place(place)) {{ #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) phi::backends::gpu::SetDeviceId(place.device); - VLOG(1) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; + VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); @@ -124,7 +124,7 @@ def FindParsingFunctionFromAttributeType(atype): if (paddle::platform::is_custom_place(place)) {{ #if defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DeviceManager::SetDevice(place); - VLOG(1) <<"CurrentDeviceId: " << phi::DeviceManager::GetDevice(place.GetDeviceType()) << " from " << (int)place.device; + VLOG(4) <<"CurrentDeviceId: " << phi::DeviceManager::GetDevice(place.GetDeviceType()) << " from " << (int)place.device; #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 23ba88c8898c1..8a6b59808d702 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -32,7 +32,7 @@ static void clear_no_grad_edges( for (size_t i = 0; i < params.size(); ++i) { auto p_grad_name = paddle::framework::GradVarName(params[i].name()); if (!block_desc->HasVar(p_grad_name)) { - VLOG(1) << "clear edge of " << p_grad_name; + VLOG(3) << "clear edge of " << p_grad_name; grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear(); } } @@ -48,7 +48,7 @@ static void clear_no_grad_edges_with_partial_block( auto p_grad_name = paddle::framework::GradVarName(params[i].name()); if (!forward_block_desc->HasVar(p_grad_name) && !backward_block_desc->HasVar(p_grad_name)) { - VLOG(1) << "clear edge of " << p_grad_name; + VLOG(3) << "clear edge of " << p_grad_name; grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear(); } } diff --git a/paddle/fluid/framework/attribute_checker.h b/paddle/fluid/framework/attribute_checker.h index 6552d167e1d01..67eb69efdf3d0 100644 --- a/paddle/fluid/framework/attribute_checker.h +++ b/paddle/fluid/framework/attribute_checker.h @@ -249,7 +249,7 @@ class TypedAttrChecker { "doesn't support phi::DenseTensor type.", attr_name_)); - VLOG(1) << "Found Attribute " << attr_name_ << " with type(Variable)."; + VLOG(3) << "Found Attribute " << attr_name_ << " with type(Variable)."; var_info_checker_(it->second); return; } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 400c0021d6d7e..08f73c51fe3a9 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -223,7 +223,7 @@ void Tracer::TraceOpImpl(const std::string& type, platform::RecordEvent op_type_record_event( type, platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; - VLOG(1) << "Trace Op: " << type; + VLOG(4) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for // FLAGS_use_mkldnn=1) diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 493a8d0b33f1d..cdace567b2e9d 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -161,7 +161,7 @@ static PyObject* eager_api_run_partial_grad(PyObject* self, only_inputs, allow_unused, no_grad_vars); - VLOG(1) << " in eager_api_run_partial_grad, after runing 
egr::Grad"; + VLOG(4) << " in eager_api_run_partial_grad, after runing egr::Grad"; } return ToPyObject(result, true /* return_py_none_if_not_initialize */); EAGER_CATCH_AND_THROW_RETURN_NULL diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 6c7d974e70422..24ec364efb3b6 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -349,7 +349,7 @@ static PyObject* tensor__rsub__method(TensorObject* self, 1); EAGER_TRY - VLOG(1) << "Running Eager tensor__rsub__method"; + VLOG(4) << "Running Eager tensor__rsub__method"; // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); @@ -771,7 +771,7 @@ static PyObject* tensor__gt__method(TensorObject* self, 1); EAGER_TRY - VLOG(1) << "Running Eager tensor__gt__method"; + VLOG(4) << "Running Eager tensor__gt__method"; // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); @@ -857,7 +857,7 @@ static PyObject* tensor__ge__method(TensorObject* self, 1); EAGER_TRY - VLOG(1) << "Running Eager tensor__ge__method"; + VLOG(4) << "Running Eager tensor__ge__method"; // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); @@ -1134,7 +1134,7 @@ static PyObject* tensor__lt__method(TensorObject* self, 1); EAGER_TRY - VLOG(1) << "Running Eager tensor__lt__method"; + VLOG(4) << "Running Eager tensor__lt__method"; // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); @@ -1220,7 +1220,7 @@ static PyObject* tensor__le__method(TensorObject* self, 1); EAGER_TRY - VLOG(1) << "Running Eager tensor__le__method"; + VLOG(4) << "Running Eager tensor__le__method"; // Set Device ID auto place = egr::Controller::Instance().GetExpectedPlace(); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 07978fc053647..3c52a705fc506 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -800,7 +800,7 @@ static PyObject* tensor_method__get_tensor_from_selected_rows( auto* dense_tensor = static_cast(selected_rows->mutable_value()); - VLOG(1) << "dense_tensor: " << dense_tensor->IsInitialized(); + VLOG(4) << "dense_tensor: " << dense_tensor->IsInitialized(); auto t = paddle::experimental::Tensor( egr::Controller::Instance().GenerateUniqueName()); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 04f9e20aa2848..4cbac193ad070 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -1243,7 +1243,7 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, ssize_t arg_pos) { PyTypeObject* type = obj->ob_type; auto type_name = std::string(type->tp_name); - VLOG(1) << "type_name: " << type_name; + VLOG(4) << "type_name: " << type_name; if (type_name == "numpy.ndarray" && PySequence_Check(obj)) { PyObject* item = nullptr; item = PySequence_GetItem(obj, 0); @@ -1296,7 +1296,7 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, // obj could be: int, float, bool, paddle.Tensor PyTypeObject* type = obj->ob_type; auto type_name = std::string(type->tp_name); - VLOG(1) << "type_name: " << type_name; + VLOG(4) << "type_name: " << type_name; if (PyBool_Check(obj)) { bool value = CastPyArg2Boolean(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); @@ -1348,7 +1348,7 @@ std::vector CastPyArg2ScalarArray(PyObject* obj, PyTypeObject* type = obj->ob_type; auto type_name = std::string(type->tp_name); - VLOG(1) << "type_name: " << 
type_name; + VLOG(4) << "type_name: " << type_name; if (PyList_Check(obj)) { Py_ssize_t len = PyList_Size(obj); PyObject* item = nullptr; diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 466a60be250a0..c48388a03173d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -195,10 +195,10 @@ void BincountInferMeta(const MetaTensor& x, "But the dimension of Input(X) is [%d]", input_dim.size())); - VLOG(1) << "####### CHECK weights"; + VLOG(4) << "####### CHECK weights"; if (weights) { auto weights_dim = weights.dims(); - VLOG(1) << "##### weights_dim " << weights_dim; + VLOG(4) << "##### weights_dim " << weights_dim; PADDLE_ENFORCE_EQ(weights_dim.size(), 1, phi::errors::InvalidArgument( diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4746adc892e89..f51a4a2b2b9de 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3732,7 +3732,7 @@ void StridedSliceRawInferMeta(const MetaTensor& x, } out_dims = phi::make_ddim(new_out_shape); } - VLOG(1) << "out_dims: " << out_dims; + VLOG(4) << "out_dims: " << out_dims; out->set_dims(out_dims); out->share_lod(x); out->set_dtype(x.dtype()); From a762d68eee187bb3515b6fd6956dc6e1da0da20d Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 16 Nov 2022 13:38:12 +0800 Subject: [PATCH 029/210] remove avx check (#48003) * remove avx check * fix bug; --- paddle/fluid/platform/init.cc | 46 ----------------------------------- 1 file changed, 46 deletions(-) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 19175914103b1..9045b4b54cc51 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -285,52 +285,6 @@ void InitDevices(const std::vector devices) { #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif - -#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::MayIUse(platform::avx)) { -#ifndef __AVX__ - LOG(WARNING) << "AVX is available, Please re-compile on local machine"; -#endif - } - -// Throw some informations when CPU instructions mismatch. -#define AVX_GUIDE(compiletime, runtime) \ - PADDLE_THROW(platform::errors::Unavailable( \ - "This version is compiled on higher instruction(" #compiletime \ - ") system, you may encounter illegal instruction error running on" \ - " your local CPU machine. 
Please reinstall the " #runtime \ - " version or compile from source code.")) - -#ifdef __AVX512F__ - if (!platform::MayIUse(platform::avx512f)) { - if (platform::MayIUse(platform::avx2)) { - AVX_GUIDE(AVX512, AVX2); - } else if (platform::MayIUse(platform::avx)) { - AVX_GUIDE(AVX512, AVX); - } else { - AVX_GUIDE(AVX512, NonAVX); - } - } -#endif - -#ifdef __AVX2__ - if (!platform::MayIUse(platform::avx2)) { - if (platform::MayIUse(platform::avx)) { - AVX_GUIDE(AVX2, AVX); - } else { - AVX_GUIDE(AVX2, NonAVX); - } - } -#endif - -#ifdef __AVX__ - if (!platform::MayIUse(platform::avx)) { - AVX_GUIDE(AVX, NonAVX); - } -#endif -#undef AVX_GUIDE - -#endif } #ifndef _WIN32 From e23dfed9ea00f69acd4ba90583d5e355cdaa3b59 Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Wed, 16 Nov 2022 14:26:40 +0800 Subject: [PATCH 030/210] Fix paddle rec, kim, dsin models' bugs (#47792) * add stat tool * add roll and roll_grad kernels and strided_slice and strided_slice_grad kernels, test=kunlun * embedding and embedding_grad add int32 input, test=kunlun --- .../kernels/funcs/selected_rows_functor.cc | 64 ++++++++++++++ paddle/phi/kernels/xpu/add_n_kernel.cc | 65 +++++++++++--- .../phi/kernels/xpu/embedding_grad_kernel.cc | 13 ++- paddle/phi/kernels/xpu/embedding_kernel.cc | 12 ++- .../tests/unittests/xpu/test_sum_op_xpu.py | 85 +++++++++++++++++++ 5 files changed, 224 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index e1d45eef54981..de362d45a8ba7 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -320,11 +320,75 @@ struct SelectedRowsAddToTensor { } }; +#ifdef PADDLE_WITH_XPU +template +struct SelectedRowsAddToTensor { + void operator()(const phi::XPUContext& context, + const phi::SelectedRows& input1, + phi::DenseTensor* input2) { + if (UNLIKELY(input1.rows().size() == 0)) { + LOG(WARNING) << "input selected rows is empty!"; + return; + } + using XPUType = typename XPUTypeTrait::Type; + auto in1_height = input1.height(); + const auto& in2_dims = input2->dims(); + PADDLE_ENFORCE_EQ( + in1_height, + in2_dims[0], + phi::errors::InvalidArgument("The two inputs height must be equal." + "But received first input height = " + "[%d], second input height = [%d]", + in1_height, + in2_dims[0])); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + int64_t* in1_rows_data = nullptr; + xpu::VectorParam in1_rows_vec{ + in1_rows.data(), static_cast(in1_rows.size()), in1_rows_data}; + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_ENFORCE_EQ( + in1_row_numel, + input2->numel() / in1_height, + phi::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But received first input width = [%d], second input width = [%d]", + in1_row_numel, + input2->numel() / in1_height)); + + auto* in1_data = in1_value.data(); + auto* out_data = input2->data(); + + int h = in1_rows.size(); + int w = in1_row_numel; + const std::vector xshape{h, w}; + + int r = xpu::scatter( + context.x_context(), + nullptr, + reinterpret_cast(in1_data), + reinterpret_cast(out_data), + in1_rows_vec, + xshape, + 0, + false); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scatter"); + } +}; + +#endif + template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; + +#ifdef PADDLE_WITH_XPU +template struct SelectedRowsAddToTensor; +#endif // This is a separated namespace for manipulate SelectedRows typed // data. Like merge duplicated rows, adding two SelectedRows etc. // diff --git a/paddle/phi/kernels/xpu/add_n_kernel.cc b/paddle/phi/kernels/xpu/add_n_kernel.cc index 324ced03d98e3..c1411a10d54b5 100644 --- a/paddle/phi/kernels/xpu/add_n_kernel.cc +++ b/paddle/phi/kernels/xpu/add_n_kernel.cc @@ -17,6 +17,8 @@ #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/selected_rows_functor.h" + namespace phi { template @@ -25,6 +27,8 @@ void AddNKernel(const Context& dev_ctx, DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; size_t in_num = x.size(); + dev_ctx.template Alloc(out); + bool in_place = false; if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) { if ((static_cast(x[0]))->Holder() == out->Holder()) { @@ -33,26 +37,61 @@ void AddNKernel(const Context& dev_ctx, } if (!in_place) { - dev_ctx.template Alloc(out); + int r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(out->data()), + out->numel(), + XPUType(0)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); } + std::vector ptrs; + phi::funcs::SelectedRowsAddToTensor functor; for (size_t i = 0; i < in_num; ++i) { - PADDLE_ENFORCE_EQ(DenseTensor::classof(x[i]), - true, - errors::InvalidArgument("XPU only support DensorTensor")); + if (DenseTensor::classof(x[i])) { + auto& in_t = *(static_cast(x[i])); + if (!in_t.initialized() || in_t.numel() == 0) { + continue; + } + ptrs.push_back(reinterpret_cast(in_t.data())); + } else if (SelectedRows::classof(x[i])) { + PADDLE_ENFORCE_EQ(x[i]->dtype(), + DataType::FLOAT32, + errors::InvalidArgument("SelectedRowsAdd(scatter) only", + "supports float type")); - auto& in_t = *(static_cast(x[i])); - if (in_t.numel() == 0) { - continue; + auto& in_t = *(static_cast(x[i])); + functor(dev_ctx, in_t, out); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Expected type of Input(X) of %d-th must be Tensor, " + "SelectedRows. 
But got " + "unsupport type: %s.", + x[i]->type_info().name())); } - ptrs.push_back(reinterpret_cast(in_t.data())); } - int r = xpu::sum(dev_ctx.x_context(), - ptrs, - reinterpret_cast(out->data()), - out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + if (ptrs.empty()) { + return; + } else if (ptrs.size() < x.size()) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* out_t = RAII_GUARD.alloc_l3_or_gm(out->numel()); + int r = xpu::sum(dev_ctx.x_context(), ptrs, out_t, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + + r = xpu::add(dev_ctx.x_context(), + reinterpret_cast(out->data()), + out_t, + reinterpret_cast(out->data()), + out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } else { + int r = xpu::sum(dev_ctx.x_context(), + ptrs, + reinterpret_cast(out->data()), + out->numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sum"); + } } template diff --git a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc index 53b5cdb90169b..cd3b920feffa8 100644 --- a/paddle/phi/kernels/xpu/embedding_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/embedding_grad_kernel.cc @@ -43,7 +43,18 @@ void EmbeddingGradKernel(const Context& ctx, "number of ids in LookupTableV2GradXPUKernel.")); auto& dev_ctx = ctx; - const int64_t* ids_data = ids_t->data(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + const int64_t* ids_data; + if (ids_t->dtype() == phi::DataType::INT64) { + ids_data = ids_t->data(); + } else { + int64_t* ids_tt = RAII_GUARD.alloc_l3_or_gm(ids_t->numel()); + int r = xpu::cast( + ctx.x_context(), ids_t->data(), ids_tt, ids_t->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + ids_data = reinterpret_cast(ids_tt); + } + const T* d_output_data = d_output_t->data(); T* d_table_data = dev_ctx.template Alloc(d_table_t); int xm = d_table_t->dims()[0]; diff --git a/paddle/phi/kernels/xpu/embedding_kernel.cc b/paddle/phi/kernels/xpu/embedding_kernel.cc index d0e531f8c1399..ace2116cdc963 100644 --- a/paddle/phi/kernels/xpu/embedding_kernel.cc +++ b/paddle/phi/kernels/xpu/embedding_kernel.cc @@ -42,7 +42,17 @@ void EmbeddingKernel(const Context &ctx, auto *table = table_t->data(); auto *output = dev_ctx.template Alloc(output_t); - const int64_t *ids = ids_t->data(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + const int64_t *ids; + if (ids_t->dtype() == phi::DataType::INT64) { + ids = ids_t->data(); + } else { + int64_t *ids_tt = RAII_GUARD.alloc_l3_or_gm(ids_t->numel()); + int r = xpu::cast( + ctx.x_context(), ids_t->data(), ids_tt, ids_t->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + ids = reinterpret_cast(ids_tt); + } PADDLE_ENFORCE_EQ( ids_numel <= std::numeric_limits::max(), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py index 778cb66b9a84b..84be81718fbb0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -197,6 +197,91 @@ def test_list_of_none_input(): self.assertRaises(Exception, test_list_of_none_input) +class TestLoDTensorAndSelectedRowsOp(unittest.TestCase): + def setUp(self): + self.height = 10 + self.row_numel = 12 + self.rows = [0, 1, 2, 3, 4, 5, 6] + self.dtype = np.float32 + self.init_kernel_type() + + def check_with_place(self, place, inplace): + self.check_input_and_optput(place, inplace, True, True, True) + + def init_kernel_type(self): + pass + + def _get_array(self, rows, row_numel): + array = np.ones((len(rows), 
row_numel)).astype(self.dtype) + for i in range(len(rows)): + array[i] *= rows[i] + return array + + def check_input_and_optput( + self, + place, + inplace, + w1_has_data=False, + w2_has_data=False, + w3_has_data=False, + ): + paddle.disable_static() + w1 = self.create_lod_tensor(place) + w2 = self.create_selected_rows(place, w2_has_data) + + x = [w1, w2] + out = paddle.add_n(x) + + result = np.ones((1, self.height)).astype(np.int32).tolist()[0] + for ele in self.rows: + result[ele] += 1 + + out_t = np.array(out) + self.assertEqual(out_t.shape[0], self.height) + np.testing.assert_array_equal( + out_t, + self._get_array([i for i in range(self.height)], self.row_numel) + * np.tile(np.array(result).reshape(self.height, 1), self.row_numel), + ) + + paddle.enable_static() + + def create_selected_rows(self, place, has_data): + # create and initialize W Variable + if has_data: + rows = self.rows + else: + rows = [] + + w_array = self._get_array(self.rows, self.row_numel) + var = core.eager.Tensor( + core.VarDesc.VarType.FP32, + w_array.shape, + "selected_rows", + core.VarDesc.VarType.SELECTED_ROWS, + True, + ) + + w_selected_rows = var.value().get_selected_rows() + w_selected_rows.set_height(self.height) + w_selected_rows.set_rows(rows) + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(w_array, place) + + return var + + def create_lod_tensor(self, place): + w_array = self._get_array( + [i for i in range(self.height)], self.row_numel + ) + return paddle.to_tensor(w_array) + + def test_w_is_selected_rows(self): + places = [core.XPUPlace(0)] + for place in places: + self.check_with_place(place, True) + + support_types = get_xpu_op_support_types('sum') for stype in support_types: create_test_class(globals(), XPUTestSumOp, stype) From 8e6315e4a18c5a5cf19d8949301d384647682598 Mon Sep 17 00:00:00 2001 From: Piotr Paturej <48731682+piotrekobi@users.noreply.github.com> Date: Wed, 16 Nov 2022 07:29:58 +0100 Subject: [PATCH 031/210] Add bf16 data type support to oneDNN bilinear_interp kernel (#46770) * Enable bf16 in oneDNN bilinear_interp kernel * Fix bilinear_interp_v2 not enabled in models * Remove unnecessary checks --- .../framework/ir/graph_pattern_detector.cc | 3 +- .../ir/mkldnn/cpu_bfloat16_placement_pass.cc | 4 +- .../inference/api/paddle_pass_builder.cc | 1 + .../phi/kernels/onednn/interpolate_kernel.cc | 8 ++- .../test_bilinear_interp_v2_mkldnn_op.py | 54 ++++++++++++++----- 5 files changed, 50 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 746e6077fdf5a..cb131f8ec16ac 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2789,7 +2789,8 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"cast", + std::unordered_set({"bilinear_interp_v2", + "cast", "clip", "concat", "conv2d", diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index fc7a53c4e7923..fbdafbfe304ce 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -56,9 +56,7 @@ int CPUBfloat16PlacementPass::SetMkldnnDataType(ir::Graph* graph) const { // Only float input can be converted to bfloat16 if 
(op_in->Var()->GetDataType() != proto::VarType::FP32) return; - if ((op->Op()->HasAttr("mkldnn_data_type") || - op->Op()->HasProtoAttr("mkldnn_data_type")) && - !platform::HasOpINT8DataType(op->Op())) { + if (platform::HasOpINT8DataType(op->Op()) == false) { VLOG(4) << "--- marked " << op->Op()->Type() << " operator to bfloat16 "; op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16")); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index aad6f63052040..a1980a8ba5005 100755 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -329,6 +329,7 @@ void CpuPassStrategy::EnableMKLDNN() { "conv_transpose_eltwiseadd_bn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", + "interpolate_mkldnn_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", diff --git a/paddle/phi/kernels/onednn/interpolate_kernel.cc b/paddle/phi/kernels/onednn/interpolate_kernel.cc index abef0ccf6feb3..7f6ded1958f2d 100644 --- a/paddle/phi/kernels/onednn/interpolate_kernel.cc +++ b/paddle/phi/kernels/onednn/interpolate_kernel.cc @@ -227,8 +227,12 @@ void NearestInterpKernel( } } // namespace phi -PD_REGISTER_KERNEL( - bilinear_interp, OneDNN, ONEDNN, phi::BilinearInterpKernel, float) {} +PD_REGISTER_KERNEL(bilinear_interp, + OneDNN, + ONEDNN, + phi::BilinearInterpKernel, + float, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(nearest_interp, OneDNN, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py index e6cf8381fa28a..9f40c16689d2b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_bilinear_interp_v2_mkldnn_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np import math -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci -def bilinear_interp_mkldnn_np( +def bilinear_interp_onednn_np( input, out_h, out_w, out_size=None, actual_shape=None, data_layout='NCHW' ): """bilinear interpolation implement in shape [N, C, H, W]""" @@ -65,17 +65,21 @@ def bilinear_interp_mkldnn_np( @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.") -class TestBilinearInterpMKLDNNOp(OpTest): +class TestBilinearInterpOneDNNOp(OpTest): def init_test_case(self): pass + def init_data_type(self): + pass + def setUp(self): self.op_type = "bilinear_interp_v2" self.interp_method = 'bilinear' self._cpu_only = True - self.use_mkldnn = True + self.use_onednn = True self.input_shape = [1, 1, 2, 2] self.data_layout = 'NCHW' + self.dtype = np.float32 # priority: actual_shape > out_size > scale > out_h & out_w self.out_h = 1 self.out_w = 1 @@ -84,8 +88,12 @@ def setUp(self): self.actual_shape = None self.init_test_case() + self.init_data_type() + + input_np = np.random.random(self.input_shape).astype(self.dtype) + if self.dtype == np.uint16: + input_np = convert_float_to_uint16(input_np) - input_np = np.random.random(self.input_shape).astype("float32") if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -114,7 +122,7 @@ def setUp(self): out_h = self.out_h out_w = self.out_w - 
output_np = bilinear_interp_mkldnn_np( + output_np = bilinear_interp_onednn_np( input_np, out_h, out_w, @@ -137,7 +145,7 @@ def setUp(self): 'out_w': self.out_w, 'scale': self.scale, 'data_layout': self.data_layout, - 'use_mkldnn': self.use_mkldnn, + 'use_mkldnn': self.use_onednn, } self.outputs = {'Out': output_np} @@ -145,7 +153,7 @@ def test_check_output(self): self.check_output(check_dygraph=False) -class TestBilinearInterpOpMKLDNNNHWC(TestBilinearInterpMKLDNNOp): +class TestBilinearInterpOpOneDNNNHWC(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [3, 2, 32, 16] self.out_h = 27 @@ -154,14 +162,14 @@ def init_test_case(self): self.data_layout = 'NHWC' -class TestBilinearNeighborInterpMKLDNNCase2(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNCase2(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [3, 3, 9, 6] self.out_h = 12 self.out_w = 12 -class TestBilinearNeighborInterpCase3(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNCase3(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [1, 1, 32, 64] self.out_h = 64 @@ -169,7 +177,7 @@ def init_test_case(self): self.scale = [0.1, 0.05] -class TestBilinearNeighborInterpCase4(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNCase4(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [1, 1, 32, 64] self.out_h = 64 @@ -178,7 +186,7 @@ def init_test_case(self): self.out_size = np.array([65, 129]).astype("int32") -class TestBilinearNeighborInterpCase5(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNCase5(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [1, 1, 9, 6] self.out_h = 12 @@ -186,7 +194,7 @@ def init_test_case(self): self.out_size = np.array([13, 13]).astype("int32") -class TestBilinearNeighborInterpCase6(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNCase6(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [1, 1, 32, 64] self.out_h = 64 @@ -195,7 +203,7 @@ def init_test_case(self): self.out_size = np.array([65, 129]).astype("int32") -class TestBilinearNeighborInterpSame(TestBilinearInterpMKLDNNOp): +class TestBilinearNeighborInterpOneDNNSame(TestBilinearInterpOneDNNOp): def init_test_case(self): self.input_shape = [2, 3, 32, 64] self.out_h = 32 @@ -204,6 +212,24 @@ def init_test_case(self): self.out_size = np.array([65, 129]).astype("int32") +def create_test_class(parent): + class TestBf16Case(parent): + def init_data_type(self): + self.dtype = np.uint16 + + TestBf16Case.__name__ = "{0}_{1}".format(parent.__name__, "BF16") + globals()[TestBf16Case.__name__] = TestBf16Case + + +create_test_class(TestBilinearInterpOneDNNOp) +create_test_class(TestBilinearInterpOpOneDNNNHWC) +create_test_class(TestBilinearNeighborInterpOneDNNCase2) +create_test_class(TestBilinearNeighborInterpOneDNNCase3) +create_test_class(TestBilinearNeighborInterpOneDNNCase4) +create_test_class(TestBilinearNeighborInterpOneDNNCase5) +create_test_class(TestBilinearNeighborInterpOneDNNCase6) +create_test_class(TestBilinearNeighborInterpOneDNNSame) + if __name__ == "__main__": from paddle import enable_static From 7c30458006f64b7ed0cd790663e3022fcead49f7 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Wed, 16 Nov 2022 14:36:17 +0800 Subject: [PATCH 032/210] [Opt depthwise_conv2d] Simplify depthwise_conv2d use_cudnn attribute (#48010) * simplify depthwise_conv2d phi kernel selection * fix depthwise_conv2d --- 
paddle/phi/api/yaml/legacy_backward.yaml | 8 ++++---- paddle/phi/api/yaml/legacy_ops.yaml | 4 ++-- python/paddle/nn/functional/conv.py | 4 +--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 855ea1b48b696..f9ac891526c49 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -391,7 +391,7 @@ optional : mask - backward_op : depthwise_conv2d_double_grad - forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_input), Tensor(grad_filter) args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : @@ -402,8 +402,8 @@ optional : grad_input_grad, grad_filter_grad - backward_op : depthwise_conv2d_grad - forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(input_grad), Tensor(filter_grad) infer_meta : func : GeneralBinaryGradInferMeta @@ -411,7 +411,7 @@ kernel : func : depthwise_conv2d_grad param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : use_gpudnn + use_gpudnn : True backward : depthwise_conv2d_double_grad - backward_op : depthwise_conv2d_transpose_grad diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 03f80b7934627..dca7d885be7fe 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -541,7 +541,7 @@ backward : deformable_conv_grad - op : depthwise_conv2d - args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_gpudnn) + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) infer_meta : func : DepthwiseConvInferMeta @@ -549,7 +549,7 @@ kernel : func : depthwise_conv2d param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : use_gpudnn + use_gpudnn : true backward : depthwise_conv2d_grad - op : depthwise_conv2d_transpose diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index d3de9e9ac0db4..1d5d5df4588f0 100644 --- a/python/paddle/nn/functional/conv.py +++ 
b/python/paddle/nn/functional/conv.py @@ -172,7 +172,6 @@ def _conv_nd( groups, dilation, data_format, - use_cudnn, ) if bias is not None: channel_dim = ( @@ -484,7 +483,7 @@ def conv1d( conv2d_data_format, ) else: - out = getattr(_C_ops, l_type)( + out = _C_ops.depthwise_conv2d( x, weight, stride, @@ -497,7 +496,6 @@ def conv1d( -1, False, False, - use_cudnn, ) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) From 6c54e0e86ec93367b2e2fedd0ad58177f16a17f4 Mon Sep 17 00:00:00 2001 From: Zhang Jun Date: Wed, 16 Nov 2022 14:45:04 +0800 Subject: [PATCH 033/210] [inference][trt] update trt hardswish plugin to layer (#47745) --- .../tensorrt/convert/hard_swish_op.cc | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index add9e5638f6b2..5ce386c8a71cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h" namespace paddle { namespace framework { @@ -41,7 +40,6 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs - int input_num = op_desc.Input("X").size(); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); const float threshold = @@ -69,21 +67,32 @@ class HardSwishOpConverter : public OpConverter { nvinfer1::ElementWiseOperation::kPROD); layer = eltwise_layer; } else { - if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) - plugin::HardSwishPluginDynamic* plugin = - new plugin::HardSwishPluginDynamic(threshold, scale, offset); - layer = engine_->AddDynamicPlugin(&input, input_num, plugin); -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif - } else { - plugin::HardSwishPlugin* plugin = - new plugin::HardSwishPlugin(threshold, scale, offset); - layer = engine_->AddPlugin(&input, input_num, plugin); - } + int32_t rank = input->getDimensions().nbDims; + nvinfer1::Dims constant_shape; + constant_shape.nbDims = rank; + std::fill(constant_shape.d, constant_shape.d + rank, 1); + std::vector weight_threshold_data{threshold}; + std::vector weight_scale_data{scale}; + std::vector weight_offset_data{offset}; + std::vector weight_zero_data{0.f}; + auto* threshold_data = + AddConstantLayer(weight_threshold_data.data(), constant_shape); + auto* scale_data = + AddConstantLayer(weight_scale_data.data(), constant_shape); + auto* offset_data = + AddConstantLayer(weight_offset_data.data(), constant_shape); + auto* zero_data = + AddConstantLayer(weight_zero_data.data(), constant_shape); + + auto* input_sum_with_offset = Sum(input, offset_data); + auto* pre_max_with_zero = Max(input_sum_with_offset, zero_data); + auto* pre_min_with_threshold = Min(pre_max_with_zero, threshold_data); + auto* pre_prod_with_input = Prod(pre_min_with_threshold, input); + layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *pre_prod_with_input, + *scale_data, + nvinfer1::ElementWiseOperation::kDIV); } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "hard_swish", {output_name}, test_mode); From 
9cf3aa6159dd73ecccd51f51d57be7850987f2a8 Mon Sep 17 00:00:00 2001 From: Zhang Jun Date: Wed, 16 Nov 2022 14:46:15 +0800 Subject: [PATCH 034/210] trt memory set change from setMaxWorkspaceSize to setMemoryPoolLimit since trt 8.3+ (#47795) --- paddle/fluid/inference/tensorrt/engine.cc | 6 +++++- paddle/fluid/inference/tensorrt/test_tensorrt.cc | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index c6c0d0479c9e5..0b3c099934c57 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -150,8 +150,12 @@ void TensorRTEngine::FreezeNetwork() { "Call InitNetwork first to initialize network.")); // build engine. infer_builder_->setMaxBatchSize(max_batch_); +#if IS_TRT_VERSION_GE(8300) + infer_builder_config_->setMemoryPoolLimit( + nvinfer1::MemoryPoolType::kWORKSPACE, max_workspace_); +#else infer_builder_config_->setMaxWorkspaceSize(max_workspace_); - +#endif bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf); if (enable_fp16) { bool support_fp16 = infer_builder_->platformHasFastFp16(); diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 4a8bcb2565615..571a2b56c56a3 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -95,7 +95,11 @@ nvinfer1::IHostMemory* CreateNetwork() { network->markOutput(*output); // Build the engine. builder->setMaxBatchSize(1); +#if IS_TRT_VERSION_GE(8300) + config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, 1 << 10); +#else config->setMaxWorkspaceSize(1 << 10); +#endif auto engine = builder->buildEngineWithConfig(*network, *config); EXPECT_NE(engine, nullptr); // Serialize the engine to create a model, then close. From fd15390a21ae1db41ce6a4a15b06e127ccc9cbbc Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Wed, 16 Nov 2022 15:12:09 +0800 Subject: [PATCH 035/210] remove chunk_eval in nn.py under fluid (#47948) * remove chunk_eval --- python/paddle/fluid/evaluator.py | 118 ------------- python/paddle/fluid/layers/nn.py | 163 ------------------ .../tests/unittests/test_chunk_eval_op.py | 47 ----- .../fluid/tests/unittests/test_layers.py | 49 ------ 4 files changed, 377 deletions(-) diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index 007337b9d0404..b8ccfd083cafd 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -23,7 +23,6 @@ from .layers import detection __all__ = [ - 'ChunkEvaluator', 'EditDistance', 'DetectionMAP', ] @@ -127,123 +126,6 @@ def _create_state(self, suffix, dtype, shape): return state -class ChunkEvaluator(Evaluator): - """ - Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator - instead. - - Accumulate counter numbers output by chunk_eval from mini-batches and - compute the precision recall and F1-score using the accumulated counter - numbers. - For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. - - Args: - input (Variable): prediction output of the network. - label (Variable): label of the test data set. - chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details. - num_chunk_types (int): the number of chunk type. - excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted. 
- - Returns: - tuple: tuple containing: precision, recall, f1_score - - Examples: - .. code-block:: python - - exe = fluid.executor(place) - evaluator = fluid.Evaluator.ChunkEvaluator(input, label) - for epoch in PASS_NUM: - evaluator.reset(exe) - for data in batches: - loss = exe.run(fetch_list=[cost]) - distance, instance_error = distance_evaluator.eval(exe) - """ - - def __init__( - self, - input, - label, - chunk_scheme, - num_chunk_types, - excluded_chunk_types=None, - ): - super().__init__("chunk_eval") - main_program = self.helper.main_program - if main_program.current_block().idx != 0: - raise ValueError("You can only invoke Evaluator in root block") - - self.num_infer_chunks = self._create_state( - dtype='int64', shape=[1], suffix='num_infer_chunks' - ) - self.num_label_chunks = self._create_state( - dtype='int64', shape=[1], suffix='num_label_chunks' - ) - self.num_correct_chunks = self._create_state( - dtype='int64', shape=[1], suffix='num_correct_chunks' - ) - ( - precision, - recall, - f1_score, - num_infer_chunks, - num_label_chunks, - num_correct_chunks, - ) = layers.chunk_eval( - input=input, - label=label, - chunk_scheme=chunk_scheme, - num_chunk_types=num_chunk_types, - excluded_chunk_types=excluded_chunk_types, - ) - layers.sums( - input=[self.num_infer_chunks, num_infer_chunks], - out=self.num_infer_chunks, - ) - layers.sums( - input=[self.num_label_chunks, num_label_chunks], - out=self.num_label_chunks, - ) - layers.sums( - input=[self.num_correct_chunks, num_correct_chunks], - out=self.num_correct_chunks, - ) - - self.metrics.extend([precision, recall, f1_score]) - - def eval(self, executor, eval_program=None): - if eval_program is None: - eval_program = Program() - block = eval_program.current_block() - num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run( - eval_program, - fetch_list=[_clone_var_(block, state) for state in self.states], - ) - num_infer_chunks = num_infer_chunks[0] - num_label_chunks = num_label_chunks[0] - num_correct_chunks = num_correct_chunks[0] - precision = ( - float(num_correct_chunks) / num_infer_chunks - if num_infer_chunks - else 0 - ) - recall = ( - float(num_correct_chunks) / num_label_chunks - if num_label_chunks - else 0 - ) - f1_score = ( - float(2 * precision * recall) / (precision + recall) - if num_correct_chunks - else 0 - ) - return ( - np.array([precision], dtype='float32'), - np.array([recall], dtype='float32'), - np.array([f1_score], dtype='float32'), - ) - - class EditDistance(Evaluator): """ Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 57e8a24e0dc75..d105ea892ccf2 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -68,7 +68,6 @@ 'linear_chain_crf', 'crf_decoding', 'cos_sim', - 'chunk_eval', 'conv2d', 'conv3d', 'softmax', @@ -1254,168 +1253,6 @@ def get_attrs(prog, dropout_prob, is_test, seed): return out -@templatedoc() -def chunk_eval( - input, - label, - chunk_scheme, - num_chunk_types, - excluded_chunk_types=None, - seq_length=None, -): - r""" - This operator computes the precision, recall and F1-score for chunk detection. - It is often used in sequence tagging tasks, such as Named Entity Recognition(NER). - - For some basics of chunking, please refer to - `Chunking with Support Vector Machines `_ . - - This operator supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. 
- Here is a NER example for the usage of these tagging schemes: - - .. code-block:: python - - ====== ====== ====== ===== == ============ ===== ===== ===== == ========= - Li Ming works at Agricultural Bank of China in Beijing. - ====== ====== ====== ===== == ============ ===== ===== ===== == ========= - IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC - IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC - IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC - IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC - ====== ====== ====== ===== == ============ ===== ===== ===== == ========= - - There are three chunk types(named entity types) including PER(person), ORG(organization) - and LOC(location), and we can see that the labels have the form `-` . - - Since the implementation of this operator actually uses label ids rather than - label strings, to make it work, there should be a way to map label ids to - tag types and chunk types. This operator uses the following way to do mapping: - - .. code-block:: python - - tag_type = label % num_tag_type - chunk_type = label / num_tag_type - - where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` - is the num of chunk types, and `tag_type` get its value from the following table. - - .. code-block:: python - - Scheme Begin Inside End Single - plain 0 - - - - IOB 0 1 - - - IOE - 0 1 - - IOBES 0 1 2 3 - - Accordingly, in the above NER example, if the tagging scheme is IOB and chunk - types are ORG, PER and LOC, then the label ids would be as follows: - - .. code-block:: python - - B-ORG 0 - I-ORG 1 - B-PER 2 - I-PER 3 - B-LOC 4 - I-LOC 5 - O 6 - - With which we can map each label id to the corresponding tag type and chunk - type correctly. - - Args: - input (Tensor): A Tensor representing the predicted labels - from the network. Its shape would be `[N, M, 1]`, - where `N` stands for batch size, `M` for sequence length. - The data type should be int64. - label (Tensor): A Tensor representing the ground-truth labels. - It should have the same shape, lod and data type as ``input`` . - chunk_scheme (str): Indicate the tagging schemes used here. The value must - be IOB, IOE, IOBES or plain. - num_chunk_types (int): The number of chunk types. - excluded_chunk_types (list, optional): Indicate the chunk types shouldn't - be taken into account. It should be a list of chunk type ids(integer). - Default None. - seq_length(Tensor, optional): A 1D Tensor containing the length of each - sequence when ``input`` and ``label`` are Tensor. Default None. - - Returns: - tuple: A tuple including precision, recall, F1-score, chunk number detected, \ - chunk number in ground-truth, chunk number correctly detected. Each \ - is a Tensor with shape `[1]`. The data type of precision, recall and \ - F1-score all is float32, and the others' data type all is int64. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - dict_size = 10000 - label_dict_len = 7 - sequence = fluid.data( - name='id', shape=[None, 1], lod_level=1, dtype='int64') - embedding = fluid.embedding( - input=sequence, size=[dict_size, 512]) - hidden = fluid.layers.fc(input=embedding, size=512) - label = fluid.data( - name='label', shape=[None, 1], lod_level=1, dtype='int64') - crf = fluid.layers.linear_chain_crf( - input=hidden, label=label, param_attr=fluid.ParamAttr(name="crfw")) - crf_decode = fluid.layers.crf_decoding( - input=hidden, param_attr=fluid.ParamAttr(name="crfw")) - fluid.layers.chunk_eval( - input=crf_decode, - label=label, - chunk_scheme="IOB", - num_chunk_types=int((label_dict_len - 1) / 2)) - """ - helper = LayerHelper("chunk_eval", **locals()) - - check_variable_and_dtype(input, 'input', ['int64'], 'chunk_eval') - check_variable_and_dtype(label, 'label', ['int64'], 'chunk_eval') - - # prepare output - precision = helper.create_variable_for_type_inference(dtype="float32") - recall = helper.create_variable_for_type_inference(dtype="float32") - f1_score = helper.create_variable_for_type_inference(dtype="float32") - num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64") - num_label_chunks = helper.create_variable_for_type_inference(dtype="int64") - num_correct_chunks = helper.create_variable_for_type_inference( - dtype="int64" - ) - - this_input = {"Inference": [input], "Label": [label]} - - if seq_length is not None: - this_input["SeqLength"] = [seq_length] - - helper.append_op( - type="chunk_eval", - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks], - }, - attrs={ - "num_chunk_types": num_chunk_types, - "chunk_scheme": chunk_scheme, - "excluded_chunk_types": excluded_chunk_types or [], - }, - ) - return ( - precision, - recall, - f1_score, - num_infer_chunks, - num_label_chunks, - num_correct_chunks, - ) - - @deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax") def softmax(input, use_cudnn=True, name=None, axis=-1): r""" diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py index d6b2bcc28c0d3..1cbf8ebabb804 100644 --- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py +++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py @@ -16,8 +16,6 @@ import numpy as np from op_test import OpTest import numpy as np -from paddle.fluid import Program, program_guard -from paddle import fluid class Segment: @@ -283,50 +281,5 @@ def set_input(self, infer, label, lod): } -class TestChunkEvalOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - - def test_input(): - input_data = np.random.random(1, 1).astype("int64") - label_data = np.random.random(1).astype("int64") - fluid.layers.chunk_eval( - input=input_data, - label=label_data, - chunk_scheme="IOB", - num_chunk_types=3, - ) - - self.assertRaises(TypeError, test_input) - - def test_label(): - input_ = fluid.data( - name="input", shape=[None, 1], dtype="int64" - ) - label_data = np.random.random(1).astype("int64") - fluid.layers.chunk_eval( - input=input_, - label=label_data, - chunk_scheme="IOB", - num_chunk_types=3, - ) - - self.assertRaises(TypeError, test_label) - - def test_type(): - in_data = fluid.data( - name="input_", shape=[None, 1], dtype="int32" - ) - label = 
fluid.data(name="label_", shape=[1], dtype="int64") - fluid.layers.chunk_eval( - input=in_data, - label=label, - chunk_scheme="IOB", - num_chunk_types=3, - ) - - self.assertRaises(TypeError, test_type) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9f9e98bfca1c7..328f719d40537 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -4169,55 +4169,6 @@ def test_dynamic_lstmp(self): ) ) - def test_linear_chain_crf(self): - with self.static_graph(): - label_dict_len = 10 - feature = layers.data(name='feature', shape=[784], dtype='float32') - label = layers.data(name='label', shape=[1], dtype='int64') - emission = layers.fc(input=feature, size=10) - crf = layers.linear_chain_crf( - input=emission, label=label, param_attr=ParamAttr(name="crfw") - ) - crf_decode = layers.crf_decoding( - input=emission, param_attr=ParamAttr(name="crfw") - ) - self.assertIsNotNone(crf) - self.assertIsNotNone(crf_decode) - return layers.chunk_eval( - input=crf_decode, - label=label, - chunk_scheme="IOB", - num_chunk_types=(label_dict_len - 1) // 2, - ) - - def test_linear_chain_crf_padding(self): - with self.static_graph(): - label_dict_len, max_len = 10, 20 - feature = layers.data( - name='feature', shape=[max_len, 784], dtype='float32' - ) - label = layers.data(name='label', shape=[max_len], dtype='int64') - length = layers.data(name='length', shape=[1], dtype='int64') - emission = layers.fc(input=feature, size=10, num_flatten_dims=2) - crf = layers.linear_chain_crf( - input=emission, - label=label, - length=length, - param_attr=ParamAttr(name="crfw"), - ) - crf_decode = layers.crf_decoding( - input=emission, length=length, param_attr=ParamAttr(name="crfw") - ) - self.assertIsNotNone(crf) - self.assertIsNotNone(crf_decode) - return layers.chunk_eval( - input=crf_decode, - label=label, - seq_length=length, - chunk_scheme="IOB", - num_chunk_types=(label_dict_len - 1) // 2, - ) - def test_im2sequence(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): From 9fba1e72e918cfc14dfc1eafa1160e9bbc534f75 Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Wed, 16 Nov 2022 15:12:27 +0800 Subject: [PATCH 036/210] remove adaptive_pool2d and adaptive_pool3d (#48004) --- python/paddle/fluid/layers/nn.py | 316 ------------------ .../unittests/ipu/test_pool_avg_op_ipu.py | 17 - .../unittests/ipu/test_pool_max_op_ipu.py | 17 - .../fluid/tests/unittests/test_layers.py | 32 -- 4 files changed, 382 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d105ea892ccf2..06e49b8a25f11 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,8 +73,6 @@ 'softmax', 'pool2d', 'pool3d', - 'adaptive_pool2d', - 'adaptive_pool3d', 'batch_norm', 'inplace_abn', 'instance_norm', @@ -2518,320 +2516,6 @@ def is_list_or_tuple(ele): return pool_out -@deprecated(since="2.0.0") -@templatedoc(op_type="pool2d") -def adaptive_pool2d( - input, pool_size, pool_type="max", require_index=False, name=None -): - r""" - - This operation calculates the output based on the input, pool_size, - pool_type parameters. Input(X) and output(Out) are in NCHW format, where N is batch - size, C is the number of channels, H is the height of the feature, and W is - the width of the feature. 
Parameters(pool_size) should contain two elements which - represent height and width, respectively. Also the H and W dimensions of output(Out) - is same as Parameter(pool_size). The output tensor shape will be [N, C, pool_size[0], pool_size[1]] - - For average adaptive pool2d: - - .. math:: - - hstart &= floor(i * H_{in} / H_{out}) - - hend &= ceil((i + 1) * H_{in} / H_{out}) - - wstart &= floor(j * W_{in} / W_{out}) - - wend &= ceil((j + 1) * W_{in} / W_{out}) - - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} - - Args: - input (Tensor): The input tensor of pooling operator, which is a 4-D tensor - with shape [N, C, H, W]. The format of input tensor is NCHW, - where N is batch size, C is the number of channels, H is the - height of the feature, and W is the width of the feature. - The data type is float32 or float64. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain two integers, (pool_size_Height, pool_size_Width). - pool_type: ${pooling_type_comment} - require_index (bool): If true, the index of max pooling point will be returned along - with outputs. It cannot be set in average pooling type. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The output tensor of adaptive pooling result. The data type is same - as input tensor. - - Raises: - ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. - ValueError: 'pool_size' should be a list or tuple with length as 2. - - Examples: - .. code-block:: python - - # average adaptive pool2d - # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive average pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - paddle.enable_static() - data = paddle.rand(shape=[1,3,32,32]) - pool_out = paddle.fluid.layers.adaptive_pool2d( - input=data, - pool_size=[3, 3], - pool_type='avg') - - # max adaptive pool2d - # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m * n grids averagely and performs poolings in each - # grid to get output. 
- # adaptive average pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) - # - import paddle - data = paddle.rand(shape=[1,3,32,32]) - pool_out = paddle.fluid.layers.adaptive_pool2d( - input=data, - pool_size=[3, 3], - pool_type='max') - """ - check_variable_and_dtype( - input, - 'input', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool2d', - ) - check_type(pool_type, 'pool_type', str, 'adaptive_pool2d') - check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool2d') - check_type(require_index, 'require_index', bool, 'adaptive_pool2d') - if pool_type not in ["max", "avg"]: - raise ValueError( - "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type), - ) - - if pool_type == "avg" and require_index: - raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'." - ) - - pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') - - if pool_type == "max": - l_type = 'max_pool2d_with_index' - else: - l_type = "pool2d" - - helper = LayerHelper(l_type, **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - outputs = {"Out": pool_out} - if pool_type == "max": - mask = helper.create_variable_for_type_inference(dtype) - outputs["Mask"] = mask - - helper.append_op( - type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }, - ) - - return (pool_out, mask) if require_index else pool_out - - -@deprecated(since="2.0.0") -@templatedoc(op_type="pool3d") -def adaptive_pool3d( - input, pool_size, pool_type="max", require_index=False, name=None -): - r""" - - This operation calculates the output based on the input, pool_size, - pool_type parameters. Input(X) and output(Out) are in NCDHW format, where N is batch - size, C is the number of channels, D is the depth of the feature, H is the height of - the feature, and W is the width of the feature. Parameters(pool_size) should contain - three elements which represent height and width, respectively. Also the D, H and W - dimensions of output(Out) is same as Parameter(pool_size). The output tensor shape - will be [N, C, pool_size[0], pool_size[1], pool_size[2]] - - For average adaptive pool3d: - - .. math:: - - dstart &= floor(i * D_{in} / D_{out}) - - dend &= ceil((i + 1) * D_{in} / D_{out}) - - hstart &= floor(j * H_{in} / H_{out}) - - hend &= ceil((j + 1) * H_{in} / H_{out}) - - wstart &= floor(k * W_{in} / W_{out}) - - wend &= ceil((k + 1) * W_{in} / W_{out}) - - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} - - Args: - input (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is NCDHW, where - N is batch size, C is the number of channels, D is the depth of the feature, - H is the height of the feature, and W is the width of the feature. - The data type is float32 or float64. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three integers, (Depth, Height, Width). - pool_type: ${pooling_type_comment} - require_index (bool): If true, the index of max pooling point will be returned along - with outputs. 
It cannot be set in average pooling type. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. - - Raises: - ValueError: 'pool_type' is not 'max' nor 'avg'. - ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. - ValueError: 'pool_size' should be a list or tuple with length as 2. - - Examples: - .. code-block:: python - - # average adaptive pool3d - # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive average pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - # - - import paddle - paddle.enable_static() - data = paddle.rand(shape=[1,3,32,32,32]) - pool_out = paddle.fluid.layers.adaptive_pool3d( - input=data, - pool_size=[3, 3, 3], - pool_type='avg') - - # max adaptive pool3d - # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into l * m * n grids averagely and performs poolings in each - # grid to get output. - # adaptive average pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(j * H / m) - # hend = ceil((j + 1) * H / m) - # wstart = floor(k * W / n) - # wend = ceil((k + 1) * W / n) - # output[:, :, i, j, k] = - # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) - # - - import paddle - data = paddle.rand(shape=[1,3,32,32,32]) - pool_out = paddle.fluid.layers.adaptive_pool3d( - input=data, - pool_size=[3, 3, 3], - pool_type='max') - """ - check_variable_and_dtype( - input, - 'input', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool3d', - ) - check_type(pool_type, 'pool_type', str, 'adaptive_pool3d') - check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool3d') - check_type(require_index, 'require_index', bool, 'adaptive_pool3d') - if pool_type not in ["max", "avg"]: - raise ValueError( - "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type), - ) - - if pool_type == "avg" and require_index: - raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'." 
- ) - - pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') - - if pool_type == "max": - l_type = 'max_pool3d_with_index' - else: - l_type = "pool3d" - - helper = LayerHelper(l_type, **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - - outputs = {"Out": pool_out} - if pool_type == "max": - mask = helper.create_variable_for_type_inference(dtype) - outputs["Mask"] = mask - - helper.append_op( - type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }, - ) - - return (pool_out, mask) if require_index else pool_out - - def batch_norm( input, act=None, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index 058e3b30a5315..88104fa253af1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -138,22 +138,5 @@ def set_attrs(self): self.attrs['exclusive'] = False -class TestAdaptive(TestBase): - def set_op_attrs(self): - self.attrs = { - "pool_size": 1, - "pool_type": 'avg', - "require_index": False, - } - - @IPUOpTest.static_graph - def build_model(self): - x = paddle.static.data( - name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' - ) - out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) - self.fetch_list = [out.name] - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index aff790a775a9f..3fa93cc89dfab 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -137,22 +137,5 @@ def set_op_attrs(self): self.attrs['exclusive'] = False -class TestAdaptive(TestBase): - def set_op_attrs(self): - self.attrs = { - "pool_size": 1, - "pool_type": 'max', - "require_index": False, - } - - @IPUOpTest.static_graph - def build_model(self): - x = paddle.static.data( - name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' - ) - out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) - self.fetch_list = [out.name] - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 328f719d40537..78d01722b8121 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3299,38 +3299,6 @@ def make_pool3d(self): pool_padding=(2, 1, 1), ) - def make_adaptive_pool2d(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data(name='x', shape=[3, 224, 224], dtype='float32') - return layers.adaptive_pool2d(x, [3, 3], pool_type='avg') - pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) - return pool - return mask - return layers.adaptive_pool2d(x, 3, pool_type='avg') - pool, mask = layers.adaptive_pool2d(x, 3, require_index=True) - return pool - return mask - - def make_adaptive_pool3d(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data( - name='x', shape=[3, 244, 224, 224], dtype='float32' - ) - return layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg') - pool, mask = layers.adaptive_pool3d( - x, [3, 3, 3], 
require_index=True - ) - return pool - return mask - return layers.adaptive_pool3d(x, 3, pool_type='avg') - pool, mask = layers.adaptive_pool3d(x, 3, require_index=True) - return pool - return mask - def make_lstm_unit(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() From a2a97cbbac10a050e6ad13999926867e1a4aaafe Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Wed, 16 Nov 2022 15:48:33 +0800 Subject: [PATCH 037/210] [remove fluid] under fleet meta_optimizers (#47864) * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers * [remove fluid] under fleet meta_optimizers --- python/paddle/distributed/__init__.py | 3 +- .../ascend/ascend_optimizer.py | 4 +- .../meta_optimizers/ascend/ascend_parser.py | 2 +- .../heter_parallel_optimizer.py | 8 +- .../hybrid_parallel_gradscaler.py | 4 +- .../hybrid_parallel_optimizer.py | 44 +-- .../sharding_optimizer_stage2.py | 2 +- .../meta_optimizers/sharding/fp16_helper.py | 2 +- .../sharding/offload_helper.py | 3 +- .../fleet/meta_optimizers/sharding/utils.py | 9 +- python/paddle/distributed/io.py | 288 ++++++++++++++++++ .../incubate/fleet/collective/__init__.py | 4 +- .../fluid/tests/unittests/dist_save_load.py | 6 +- .../fluid/tests/unittests/test_dist_base.py | 2 +- .../unittests/test_dist_sparse_load_ps0.py | 4 +- .../fluid/tests/unittests/test_load_op.py | 3 +- .../fluid/tests/unittests/test_load_op_xpu.py | 2 +- .../tests/unittests/test_static_save_load.py | 10 +- python/paddle/framework/__init__.py | 1 + 19 files changed, 350 insertions(+), 51 deletions(-) create mode 100644 python/paddle/distributed/io.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 3612d009045d5..4db153c53b414 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . import io from .spawn import spawn # noqa: F401 from .launch.main import launch # noqa: F401 - from .parallel import init_parallel_env # noqa: F401 from .parallel import get_rank # noqa: F401 from .parallel import get_world_size # noqa: F401 @@ -74,6 +74,7 @@ from . import rpc __all__ = [ # noqa + "io", "spawn", "launch", "scatter", diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index b0495e13b21c8..64c1881223ed5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.optimizer import Optimizer -import paddle.fluid.core as core +from paddle.optimizer import Optimizer +import paddle.framework.core as core from . 
import ascend_parser from paddle.distributed import fleet import hccl.manage.api as hccl diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 3be5636b256ec..79f79a8dea462 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid.core as core +import paddle.framework.core as core import numpy as np from functools import reduce diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py index a2a65d995ad7c..44655876e1587 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/heter_parallel_optimizer.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid import framework +import paddle.autograd as imperative_base +from paddle import framework __all__ = [] @@ -41,13 +41,13 @@ def __init__(self, optimizer, strategy): # NOTE(liubo48): In pure DataParallel mode, # the gradient synchronization is achieved through reducer. - @imperative_base.no_grad + @imperative_base.no_grad() @framework.dygraph_only def step(self): parameters_list = _obtain_optimizer_parameters_list(self._inner_opt) self._inner_opt.step() - @imperative_base.no_grad + @imperative_base.no_grad() def minimize( self, loss, startup_program=None, parameters=None, no_grad_set=None ): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index d768411dea5fe..6d723a3af77f7 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -13,7 +13,7 @@ # limitations under the License. 
from ...base.topology import ParallelMode -from paddle.fluid.dygraph import base as imperative_base +import paddle.autograd as imperative_base import paddle from paddle import _legacy_C_ops @@ -51,7 +51,7 @@ def minimize(self, optimizer, *args, **kwargs): return optimize_ops, params_grads - @imperative_base.no_grad + @imperative_base.no_grad() def _unscale(self, optimizer): if not self._enable: return diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index bd05cbe879718..38c9b7b2bfc20 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -19,10 +19,10 @@ sharding_reduce_gradients, ) from ...base.topology import ParallelMode -from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid import framework +from paddle.autograd import no_grad +from paddle import framework from ...utils.log_util import logger -from paddle.fluid import core +from paddle.framework import core from paddle.fluid import layers __all__ = [] @@ -47,7 +47,7 @@ def __init__(self, clip, hcg): self._clip = clip self._hcg = hcg - @imperative_base.no_grad + @no_grad() def _dygraph_clip(self, params_grads): sum_square_dist_fp16 = [] sum_square_dist_fp32 = [] @@ -63,8 +63,8 @@ def _dygraph_clip(self, params_grads): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(square) + square = paddle.square(merge_grad) + sum_square = paddle.sum(square) not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or ( hasattr(p, 'is_firstly_shared') @@ -89,8 +89,8 @@ def _dygraph_clip(self, params_grads): [0.0], dtype=paddle.float32 ) else: - global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16) - global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16) + global_norm_dist_fp16 = paddle.concat(sum_square_dist_fp16) + global_norm_dist_fp16 = paddle.sum(global_norm_dist_fp16) global_norm_dist_fp16 = paddle.cast( global_norm_dist_fp16, dtype=paddle.float32 ) @@ -101,29 +101,27 @@ def _dygraph_clip(self, params_grads): [0.0], dtype=paddle.float32 ) else: - global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16) - global_norm_not_dist_fp16 = layers.reduce_sum( - global_norm_not_dist_fp16 - ) + global_norm_not_dist_fp16 = paddle.concat(sum_square_not_dist_fp16) + global_norm_not_dist_fp16 = paddle.sum(global_norm_not_dist_fp16) global_norm_not_dist_fp16 = paddle.cast( global_norm_not_dist_fp16, dtype=paddle.float32 ) # global norm of distributed FP32 params_and_grads global_norm_dist_fp32 = ( - layers.concat(sum_square_dist_fp32) + paddle.concat(sum_square_dist_fp32) if len(sum_square_dist_fp32) != 0 else paddle.to_tensor([0.0], dtype=paddle.float32) ) - global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32) + global_norm_dist_fp32 = paddle.sum(global_norm_dist_fp32) # global norm of non-distributed FP32 params_and_grads global_norm_not_dist_fp32 = ( - layers.concat(sum_square_not_dist_fp32) + paddle.concat(sum_square_not_dist_fp32) if len(sum_square_not_dist_fp32) != 0 else paddle.to_tensor([0.0], dtype=paddle.float32) ) - global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32) + 
global_norm_not_dist_fp32 = paddle.sum(global_norm_not_dist_fp32) global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32 global_norm_var_not_dist = ( @@ -151,14 +149,16 @@ def _dygraph_clip(self, params_grads): group=self._hcg.get_sharding_parallel_group(), ) - global_norm_var_fp32 = layers.sqrt( + global_norm_var_fp32 = paddle.sqrt( global_norm_var_dist + global_norm_var_not_dist ) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm + max_global_norm = paddle.full( + shape=[1], + dtype=global_norm_var_fp32.dtype, + fill_value=self.clip_norm, ) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm), ) @@ -229,7 +229,7 @@ def __init__(self, optimizer, hcg, strategy): self._inner_opt._grad_clip, hcg ) - @imperative_base.no_grad + @no_grad() @framework.dygraph_only def step(self): parameters_list = _obtain_optimizer_parameters_list(self._inner_opt) @@ -241,7 +241,7 @@ def step(self): self._inner_opt.step() - @imperative_base.no_grad + @no_grad() def minimize( self, loss, startup_program=None, parameters=None, no_grad_set=None ): diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index c2bacc6a668b9..615980ab5230f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -28,7 +28,7 @@ import paddle import paddle.distributed as dist -from paddle.fluid import core +from paddle.framework import core from paddle.optimizer import Optimizer from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.distributed.collective import ( diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py index 1c500ea56b5cb..f1244c30df089 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -18,7 +18,7 @@ OpRole, ) -from paddle.fluid import core +from paddle.framework import core __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index c1951299c2cdc..058b2adc8e185 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -13,7 +13,8 @@ # limitations under the License. from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op -from paddle.fluid import core, unique_name +from paddle.framework import core +from paddle.utils import unique_name __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 9feed7b1e5aad..e5f794e51a536 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import paddle -from paddle.fluid import core, unique_name +from paddle.framework import core +from paddle.utils import unique_name from functools import reduce from paddle.distributed.fleet.meta_optimizers.common import ( is_loss_grad_op, @@ -1046,11 +1047,11 @@ def sharding_predicate(var): ) if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0: - paddle.fluid.io.save_persistables( - exe, dirname, main_program=main_program, filename=None + paddle.distributed.io.save_persistables( + exe, dirname, main_program=main_program, filename=filename ) else: - paddle.fluid.io.save_vars( + paddle.static.save_vars( exe, dirname, main_program=main_program, diff --git a/python/paddle/distributed/io.py b/python/paddle/distributed/io.py new file mode 100644 index 0000000000000..47c3368c5f591 --- /dev/null +++ b/python/paddle/distributed/io.py @@ -0,0 +1,288 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +from paddle.framework import dygraph_not_support, core +from paddle.fluid.framework import Program + + +def _save_distributed_persistables(executor, dirname, main_program): + """ + save_persistables for distributed training. + the method will do things listed below: + 1.save part of persistable variables on trainer. + 2.receive "remote prefetch variables" from parameter servers and merge them. + 3.save "distributed lookup table" on parameter servers. + 4.receive "optimizer variables" from parameter servers and merge them. + + Args: + executor(Executor): The executor to run for saving parameters. + dirname(str): The saving directory path. + main_program(Program): The program whose parameters will be + saved. the main_program must be the trainer_program + get after transpiler. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle + + paddle.enable_static() + exe = paddle.static.Executor(paddle.CPUPlace()) + param_path = "./my_paddle_model" + t = distribute_transpiler.DistributeTranspiler() + t.transpile(...) + train_program = t.get_trainer_program() + _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program) + """ + + def __save_remote_params(executor, dirname, remote_params_map): + """ + receive params on pserver through rpc. + if the params are be sliced, will concat them to one, then save it. 
+ """ + if not remote_params_map: + return + + prog = paddle.static.Program() + block = prog.global_block() + + # recv optimize vars from pserver + for name, remote_params in remote_params_map.items(): + origin = remote_params[0].origin + is_slice = remote_params[0].is_slice + + slices = [None] * len(remote_params) + slice_varnames = [None] * len(remote_params) + remote_varnames = [None] * len(remote_params) + endpoints = [None] * len(remote_params) + + for idx, optimizer in enumerate(remote_params): + block_id = optimizer.block_id + slice = optimizer.slice + endpoint = optimizer.endpoint + + index = block_id if is_slice else idx + slices[index] = slice + slice_varnames[index] = "{}.slice.{}".format(slice.name, idx) + remote_varnames[index] = slice.name + endpoints[index] = endpoint + + slice_shapes = [] + for slice in slices: + tmp = [str(dim) for dim in slice.shape] + slice_shapes.append(",".join(tmp)) + + block.append_op( + type='recv_save', + attrs={ + "trainer_id": 0, + "shape": origin.shape, + "slice_shapes": slice_shapes, + "slice_varnames": slice_varnames, + "remote_varnames": remote_varnames, + "endpoints": endpoints, + "file_path": os.path.join(dirname, origin.name), + }, + ) + + executor.run(prog) + + def __save_distributed_lookup_tables( + executor, dirname, distributed_lookup_table, endpoints + ): + """ + because the distributed lookup table may too huge to merge and save at one place, + it will be saved at parameter server independent respectively. + + the save directory is dirname/"__lookup_table__". + + """ + prog = paddle.static.Program() + block = prog.global_block() + + # if there is lookup table, the trainer 0 will notify all pserver to save. + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + attrs = {} + attrs['epmap'] = endpoints + attrs['dir'] = lookup_table_filename + attrs['lookup_table'] = distributed_lookup_table + block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs + ) + executor.run(prog) + + def __exclude_vars(exclude_var_names=[]): + def is_valid(var): + if var.name in exclude_var_names: + return False + if ( + var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var.desc.type() == core.VarDesc.VarType.FETCH_LIST + or var.desc.type() == core.VarDesc.VarType.READER + ): + return False + return var.persistable + + return is_valid + + if not isinstance(main_program, Program): + raise TypeError("'main_program' should be an instance of Program.") + + if not main_program._is_distributed: + raise ValueError( + "'_save_distributed_persistables' just be designed for distributed training." 
+ ) + + remote_params_map = ( + main_program._parameters_on_pservers.get_distributed_vars_by_vtypes( + ["Optimizer", "RemotePrefetch"], groupby=True + ) + ) + + exclude_var_names = [] + if remote_params_map: + exclude_var_names.extend(remote_params_map.keys()) + + if main_program._distributed_lookup_table: + if isinstance(main_program._distributed_lookup_table, list): + exclude_var_names.extend(main_program._distributed_lookup_table) + else: + exclude_var_names.append(main_program._distributed_lookup_table) + + local_vars = list( + filter(__exclude_vars(exclude_var_names), main_program.list_vars()) + ) + paddle.static.save_vars( + executor, main_program=main_program, dirname=dirname, vars=local_vars + ) + + if main_program._is_chief: + if remote_params_map: + __save_remote_params(executor, dirname, remote_params_map) + if main_program._distributed_lookup_table: + __save_distributed_lookup_tables( + executor, + dirname, + main_program._distributed_lookup_table, + main_program._endpoints, + ) + + +def is_persistable(var): + """ + Check whether the given variable is persistable. + + Args: + var(Variable): The variable to be checked. + + Returns: + bool: True if the given `var` is persistable + False if not. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + param = fluid.default_main_program().global_block().var('fc.b') + res = fluid.io.is_persistable(param) + """ + if ( + var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var.desc.type() == core.VarDesc.VarType.FETCH_LIST + or var.desc.type() == core.VarDesc.VarType.READER + ): + return False + return var.persistable + + +@dygraph_not_support +def save_persistables(executor, dirname, main_program=None, filename=None): + """ + Save all persistable variables from :code:`main_program` to + the folder :code:`dirname` or file :code:`filename`. You can refer to + :ref:`api_guide_model_save_reader_en` for more details. And then + saves these persistables variables to the folder :code:`dirname` or file + :code:`filename`. + + The :code:`dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set :code:`filename` None; if you would like to save all variables in a + single file, use :code:`filename` to specify the file name. + + Args: + executor(Executor): The executor to run for saving persistable variables. + You can refer to :ref:`api_guide_executor_en` for + more details. + + dirname(str, optional): The saving directory path. + When you need to save the parameter to the memory, set it to None. + main_program(Program, optional): The program whose persistbale variables will + be saved. You can refer to + :ref:`api_guide_Program_en` for more details. + If it is None, the default main program will + be used. + Default: None. + filename(str, optional): The file to save all variables. If you prefer to + save variables in different files, set it to None. + Default: None. + + Returns: + str: When saving parameters to a file, returns None. + When saving parameters to memory, returns a binary string containing parameters. + + Examples: + .. 
code-block:: python + + import paddle + + paddle.enable_static() + dir_path = "./my_paddle_model" + file_name = "persistables" + image = paddle.static..data(name='img', shape=[None, 28, 28], dtype='float32') + label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') + feeder = paddle.static.DataFeeder(feed_list=[image, label], place=paddle.CPUPlace()) + + predict = paddle.static.nn.fc(x=image, size=10, activation='softmax') + loss = paddle.nn.functional.cross_entropy(input=predict, label=label) + avg_loss = paddle.mean(loss) + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + paddle.distributed.io.save_persistables(executor=exe, dirname=dir_path, filename=file_name) + # The persistables variables weights and bias in the fc layer of the network + # are going to be saved in the same file named "persistables" in the path + # "./my_paddle_model" + """ + if main_program and main_program._is_distributed: + return _save_distributed_persistables( + executor, dirname=dirname, main_program=main_program + ) + else: + return paddle.static.save_vars( + executor, + dirname=dirname, + main_program=main_program, + vars=None, + predicate=is_persistable, + filename=filename, + ) diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index 949ef93a472a3..c18e77b29c2b2 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -154,7 +154,9 @@ def save_persistables( "must be as Program type." ) - io.save_persistables(executor, dirname, main_program, filename=filename) + paddle.distributed.io.save_persistables( + executor, dirname, main_program, filename=filename + ) def save_checkpoint( self, diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index eb36010ea6fac..7f2d864a0f585 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -178,7 +178,9 @@ def get_data(): fetch_list=[avg_cost.name], feed=feeder.feed(get_data()) ) if need_save and model_dir: - io.save_persistables(startup_exe, model_dir, trainer_prog) + paddle.distributed.io.save_persistables( + startup_exe, model_dir, trainer_prog + ) var = np.array( fluid.global_scope().find_var('__fc_b__').get_tensor() @@ -199,7 +201,7 @@ def get_data(): and idx == skip_steps and args.trainer_id == 0 ): - io.save_persistables( + paddle.distributed.io.save_persistables( startup_exe, model_dir, trainer_prog ) else: diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 5b874f1a17b92..c0f992c010bf3 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -381,7 +381,7 @@ def get_data(): infer_save_dir_fleet = os.path.join( model_save_dir, "fleet_infer_2" ) - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, model_save_dir_fluid, fleet._origin_program ) fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet) diff --git a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py index bff5754df1fe8..f79afcca3de88 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py +++ 
b/python/paddle/fluid/tests/unittests/test_dist_sparse_load_ps0.py @@ -68,7 +68,9 @@ def save_origin_model(self, emb_array, fc_array): exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) model_path = tempfile.mkdtemp() - fluid.io.save_persistables(executor=exe, dirname=model_path) + paddle.distributed.io.save_persistables( + executor=exe, dirname=model_path + ) return model_path diff --git a/python/paddle/fluid/tests/unittests/test_load_op.py b/python/paddle/fluid/tests/unittests/test_load_op.py index ed123e06a03c5..a299e6aaffe00 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op.py +++ b/python/paddle/fluid/tests/unittests/test_load_op.py @@ -18,6 +18,7 @@ import paddle.fluid.layers as layers import os import tempfile +import paddle class TestLoadOp(unittest.TestCase): @@ -42,7 +43,7 @@ def setUp(self): ) exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, dirname=os.path.join(self.temp_dir.name, "./model"), main_program=main_prog, diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py index 05ad3dc77626e..21e4636ce5b69 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -47,7 +47,7 @@ def setUp(self): ) exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, dirname=self.model_path, main_program=main_prog ) diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index cfdd62c92d350..e177e351a6de4 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -1061,7 +1061,7 @@ def test_load_from_old_interface(self): base_map[var.name] = t # fluid.save(main_program, "./test_1") - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, os.path.join(self.temp_dir.name, "test_path"), main_program ) @@ -1200,7 +1200,7 @@ def test_load_from_old_interface_var_list(self): base_map[var.name] = t # fluid.save(main_program, "./test_1") - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, os.path.join(self.temp_dir.name, "test_static_load_var_list"), main_program, @@ -1338,7 +1338,7 @@ def test_load_from_old_interface(self): base_map[var.name] = t save_dir = os.path.join(temp_dir.name, "test_path") # fluid.save(main_program, "./test_1") - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, save_dir, main_program, filename="model_single" ) @@ -1538,7 +1538,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t save_dir = os.path.join(self.temp_dir.name, "test_program_1") - fluid.io.save_persistables(exe, save_dir, main_program) + paddle.distributed.io.save_persistables(exe, save_dir, main_program) # set var to zero for var in main_program.list_vars(): @@ -1711,7 +1711,7 @@ def test_ptb_rnn_cpu_float32(self): base_map[var.name] = t save_dir = os.path.join(temp_dir.name, "test_program_2") - fluid.io.save_persistables( + paddle.distributed.io.save_persistables( exe, save_dir, main_program, filename="model_1" ) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 07e9d34e9b88c..6725ed1443591 100644 --- a/python/paddle/framework/__init__.py +++ 
b/python/paddle/framework/__init__.py @@ -55,6 +55,7 @@ _get_paddle_place, ) # noqa: F401 from ..fluid.framework import dygraph_only # noqa: F401 +from ..fluid.framework import dygraph_not_support # noqa: F401 from ..fluid.framework import ( convert_np_dtype_to_dtype_, _varbase_creator, From b4b780602271fa6dd33fc82f6b7ccac3c637b3dc Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 16 Nov 2022 16:21:19 +0800 Subject: [PATCH 038/210] elementwise_floordiv (#47944) * elementwise_op * add teller * modify ut * comments * modify ut * return * modify --- .../fluid/inference/api/analysis_predictor.cc | 2 + .../tensorrt/convert/elementwise_op.cc | 11 ++ paddle/fluid/inference/tensorrt/op_teller.cc | 4 +- .../inference/test_trt_convert_elementwise.py | 140 +++++++++++++----- 4 files changed, 121 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d2b0ba0a5fcf8..5e19ae32bd813 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2227,6 +2227,7 @@ USE_TRT_CONVERTER(elementwise_div_weight); USE_TRT_CONVERTER(elementwise_min_weight); USE_TRT_CONVERTER(elementwise_max_weight); USE_TRT_CONVERTER(elementwise_pow_weight); +USE_TRT_CONVERTER(elementwise_floordiv_weight); USE_TRT_CONVERTER(elementwise_add_tensor); USE_TRT_CONVERTER(elementwise_sub_tensor); USE_TRT_CONVERTER(elementwise_div_tensor); @@ -2234,6 +2235,7 @@ USE_TRT_CONVERTER(elementwise_mul_tensor); USE_TRT_CONVERTER(elementwise_max_tensor); USE_TRT_CONVERTER(elementwise_min_tensor); USE_TRT_CONVERTER(elementwise_pow_tensor); +USE_TRT_CONVERTER(elementwise_floordiv_tensor); USE_TRT_CONVERTER(transpose); USE_TRT_CONVERTER(transpose2); USE_TRT_CONVERTER(flatten); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 82fd1e016119f..53cb2da285afa 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -167,6 +167,7 @@ const std::unordered_map {"min", nvinfer1::ElementWiseOperation::kMIN}, {"pow", nvinfer1::ElementWiseOperation::kPOW}, {"max", nvinfer1::ElementWiseOperation::kMAX}, + {"floordiv", nvinfer1::ElementWiseOperation::kFLOOR_DIV}, }; class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { @@ -204,6 +205,12 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } }; +class ElementwiseTensorFloorDivOpConverter + : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorFloorDivOpConverter() { op_type_ = "floordiv"; } +}; + } // namespace tensorrt } // namespace inference } // namespace paddle @@ -222,6 +229,8 @@ REGISTER_TRT_OP_CONVERTER(elementwise_min_weight, ElementwiseTensorMinOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_pow_weight, ElementwiseTensorPowOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_floordiv_weight, + ElementwiseTensorFloorDivOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, ElementwiseTensorAddOpConverter); @@ -237,3 +246,5 @@ REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, ElementwiseTensorMinOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, ElementwiseTensorPowOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_floordiv_tensor, + ElementwiseTensorFloorDivOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc 
b/paddle/fluid/inference/tensorrt/op_teller.cc index fd21e70780bd0..d9d9be1241bde 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1244,7 +1244,7 @@ struct SimpleOpTypeSetTeller : public Teller { if (op_type == "elementwise_add" || op_type == "elementwise_mul" || op_type == "elementwise_sub" || op_type == "elementwise_div" || op_type == "elementwise_pow" || op_type == "elementwise_min" || - op_type == "elementwise_max") { + op_type == "elementwise_max" || op_type == "elementwise_floordiv") { if (desc.Input("X").size() != 1) { VLOG(3) << "The input op's Input(\"X\").size() " "should equal to 1, but received Input(\"X\").size() = " @@ -2288,6 +2288,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", "equal", "dropout", "prelu", @@ -2413,6 +2414,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", "equal", "dropout", "prelu", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 8420c9cdaae46..3c0230e84b52e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -28,11 +28,22 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) - - def generate_weight(): - return np.random.randn(1, 32, 1, 1).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) + + def generate_weight(op_type): + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=[1, 32, 1, 1], dtype=np.int32 + ) + else: + return np.random.randn(1, 32, 1, 1).astype(np.float32) for batch in [1, 4]: for shape in [[batch, 32, 16, 32]]: @@ -44,6 +55,7 @@ def generate_weight(): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: for axis in [-1]: self.dims = len(shape) @@ -65,12 +77,14 @@ def generate_weight(): ops=ops, weights={ "weight": TensorConfig( - data_gen=partial(generate_weight) + data_gen=partial(generate_weight, op_type) ) }, inputs={ "input_data": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial( + generate_input, shape, op_type + ) ), }, outputs=["output_data"], @@ -142,11 +156,23 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) - - def generate_weight(): - return np.random.randn(1).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) + + def generate_weight(op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=[1], dtype=np.int32 + ) + else: 
+ return np.random.randn(1).astype(np.float32) for shape in [[32]]: for op_type in [ @@ -157,6 +183,7 @@ def generate_weight(): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: for axis in [-1]: self.dims = len(shape) @@ -175,12 +202,12 @@ def generate_weight(): ops=ops, weights={ "weight": TensorConfig( - data_gen=partial(generate_weight) + data_gen=partial(generate_weight, op_type) ) }, inputs={ "input_data": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial(generate_input, shape, op_type) ), }, outputs=["output_data"], @@ -245,11 +272,23 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) - - def generate_weight(): - return np.random.randn(32).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) + + def generate_weight(op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=[32], dtype=np.int32 + ) + else: + return np.random.randn(32).astype(np.float32) for batch in [1, 4]: for shape in [ @@ -266,6 +305,7 @@ def generate_weight(): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) @@ -287,12 +327,14 @@ def generate_weight(): ops=ops, weights={ "weight": TensorConfig( - data_gen=partial(generate_weight) + data_gen=partial(generate_weight, op_type) ) }, inputs={ "input_data": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial( + generate_input, shape, op_type + ) ), }, outputs=["output_data"], @@ -379,8 +421,14 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) for shape in [[4], [4, 32], [2, 32, 16], [1, 8, 16, 32]]: for op_type in [ @@ -391,6 +439,7 @@ def generate_input(shape): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: for axis in [0, -1]: self.dims = len(shape) @@ -413,10 +462,10 @@ def generate_input(shape): weights={}, inputs={ "input_data1": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial(generate_input, shape, op_type) ), "input_data2": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial(generate_input, shape, op_type) ), }, outputs=["output_data"], @@ -530,8 +579,14 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) input1_shape_list = [[4, 32], [2, 4, 32], 
[4, 2, 4, 32]] input2_shape1_list = [[32], [4, 32], [2, 4, 32]] @@ -575,6 +630,7 @@ def generate_input(shape): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: for axis in axis_list[j][i]: self.shape1 = input1_shape @@ -599,12 +655,12 @@ def generate_input(shape): inputs={ "input_data1": TensorConfig( data_gen=partial( - generate_input, input1_shape + generate_input, input1_shape, op_type ) ), "input_data2": TensorConfig( data_gen=partial( - generate_input, input2_shape + generate_input, input2_shape, op_type ) ), }, @@ -676,12 +732,23 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): - return np.random.random(shape).astype(np.float32) + def generate_input(shape, op_type): + # elementwise_floordiv is integer only + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int32 + ) + else: + return np.random.random(shape).astype(np.float32) # use rand not randn to avoiding pow producing `NAN` - def generate_weight(): - return np.random.rand(32).astype(np.float32) + def generate_weight(op_type): + if op_type == "elementwise_floordiv": + return np.random.randint( + low=1, high=10000, size=[32], dtype=np.int32 + ) + else: + return np.random.rand(32).astype(np.float32) for batch in [1, 2, 4]: for shape in [ @@ -698,6 +765,7 @@ def generate_weight(): "elementwise_pow", "elementwise_min", "elementwise_max", + "elementwise_floordiv", ]: self.op_type = op_type for axis in [-1 if len(shape) == 1 else 1]: @@ -720,12 +788,14 @@ def generate_weight(): ops=ops, weights={ "weight": TensorConfig( - data_gen=partial(generate_weight) + data_gen=partial(generate_weight, op_type) ) }, inputs={ "input_data": TensorConfig( - data_gen=partial(generate_input, shape) + data_gen=partial( + generate_input, shape, op_type + ) ), }, outputs=["output_data"], From d6be900033a7ac93dd86b45d1be45c876629dcf6 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Wed, 16 Nov 2022 16:25:47 +0800 Subject: [PATCH 039/210] [Paddle Inference] Add fill_any_like trt converter. 
(#47974) * add_fill_any_like * add_fill_any_like --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/fill_any_like_op.cc | 93 +++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 24 +++ .../test_trt_convert_fill_any_like.py | 190 ++++++++++++++++++ 5 files changed, 309 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/fill_any_like_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_any_like.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5e19ae32bd813..2f2e0ff9f7259 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2259,6 +2259,7 @@ USE_TRT_CONVERTER(pad); USE_TRT_CONVERTER(hard_sigmoid); USE_TRT_CONVERTER(hard_swish); USE_TRT_CONVERTER(split); +USE_TRT_CONVERTER(fill_any_like); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 7ede7cd2a2b81..070e7c2c0fd8e 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -25,6 +25,7 @@ list( multihead_matmul_op.cc multihead_matmul_roformer_op.cc shuffle_channel_op.cc + fill_any_like_op.cc where_op.cc swish_op.cc silu_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/fill_any_like_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_any_like_op.cc new file mode 100644 index 0000000000000..ff4b5e389f187 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fill_any_like_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
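
At the Python level, the fill_any_like op handled by the new converter is what backs paddle.full_like: a tensor with the same shape as the input, filled with a constant. The teller added in this patch restricts it to int32 and float32 inputs. A small illustrative example (shapes and values chosen arbitrarily, not taken from the patch):

    import paddle

    x = paddle.rand([1, 1, 4, 6], dtype='float32')
    y = paddle.full_like(x, fill_value=2.0)  # same shape as x, every element is 2.0
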
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FillAnyLikeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert fill_any_like op to tensorrt layer "; + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("X").front()); + auto output_name = op_desc.Output("Out").front(); + auto input_dims = input->getDimensions(); + auto nbDims_num = input_dims.nbDims; + nvinfer1::ITensor* value_tensor; + + const int dtype = PADDLE_GET_CONST(int, op_desc.GetAttr("dtype")); + float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value")); + if ((dtype == 2) || + (dtype == -1 && input->getType() == nvinfer1::DataType::kINT32)) { + value_tensor = Add1DConstantLayer(static_cast(value), + output_name + "_value_tensor_"); + } else { + value_tensor = Add1DConstantLayer(value, output_name + "_value_tensor_"); + } + auto shape_tensor = Shape(input); + auto* one_rank_tensor = Add1DConstantLayer( + std::vector(nbDims_num, 1), output_name + "_one_rank_tensor_"); + auto input_shape_tensor = one_rank_tensor; + auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *value_tensor); + shuffle->setInput(1, *input_shape_tensor); + + std::vector start_vec(nbDims_num, 0); + nvinfer1::Dims start; + start.nbDims = nbDims_num; + for (int32_t i = 0; i < nbDims_num; ++i) { + start.d[i] = start_vec[i]; + } + nvinfer1::Dims size; + size.nbDims = nbDims_num; + nvinfer1::Dims stride; + stride.nbDims = nbDims_num; + + auto starts_tensor = + Add1DConstantLayer(start_vec, output_name + "_start_tensor_"); + auto one_tensor = Add1DConstantLayer(1, output_name + "_one_tensor_"); + + auto sizes_tensor = Max(input_shape_tensor, shape_tensor); + auto input_sub_tensor = Sub(input_shape_tensor, one_tensor); + auto strides_tensor = Min(one_tensor, input_sub_tensor); + + auto layer = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shuffle->getOutput(0), start, size, stride); + layer->setInput(1, *starts_tensor); + layer->setInput(2, *sizes_tensor); + layer->setInput(3, *strides_tensor); + + RreplenishLayerAndOutput(layer, "fill_any_like", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fill_any_like, FillAnyLikeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d9d9be1241bde..3e6fd52fab7f4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1161,6 +1161,28 @@ struct SimpleOpTypeSetTeller : public Teller { } } + if (op_type == "fill_any_like") { + if (!with_dynamic_shape) { + VLOG(3) << "the fill_any_like does not support static shape yet"; + return false; + } + int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype")); + if (dtype != -1 && dtype != 2 && dtype != 5) { + VLOG(3) << "the fill_any_like only supports int32 and float32"; + return false; + } + if (dtype == -1) { + auto* block = desc.Block(); + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); + auto input_type = x_var_desc->GetDataType(); + if (input_type != framework::proto::VarType::INT32 && + input_type != framework::proto::VarType::FP32) { 
+ VLOG(3) << "the fill_any_like only supports int32 and float32"; + return false; + } + } + } + if (op_type == "slice") { if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = @@ -2291,6 +2313,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_floordiv", "equal", "dropout", + "fill_any_like", "prelu", "conv2d_transpose", "depthwise_conv2d_transpose", @@ -2417,6 +2440,7 @@ struct SimpleOpTypeSetTeller : public Teller { "elementwise_floordiv", "equal", "dropout", + "fill_any_like", "prelu", "conv2d_transpose", "depthwise_conv2d_transpose", diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_any_like.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_any_like.py new file mode 100644 index 0000000000000..2ca057ed2701e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_any_like.py @@ -0,0 +1,190 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List, Dict, Any +import unittest + + +class TrtConvertExpandV2Test(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + if self.dtype in [0, 3, 4]: + return False + if self.dims != 4 and self.dtype != 2: + return False + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + self.input_shape = [1, 1, 4, 6] + if self.dtype == 0: + return np.random.random([1, 1, 4, 6]).astype(np.bool) + elif self.dtype == 2 or self.dtype == -1: + return np.random.random([1, 1, 4, 6]).astype(np.int32) + elif self.dtype == 3: + return np.random.random([1, 1, 4, 6]).astype(np.int64) + elif self.dtype == 4: + return np.random.random([1, 1, 4, 6]).astype(np.float16) + else: + return np.random.random([1, 1, 4, 6]).astype(np.float32) + elif self.dims == 3: + self.input_shape = [1, 8, 6] + return np.random.random([1, 8, 6]).astype(np.int32) + elif self.dims == 2: + self.input_shape = [1, 48] + return np.random.random([1, 48]).astype(np.int32) + elif self.dims == 1: + self.input_shape = [48] + return np.random.random([48]).astype(np.int32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.array([1, 48]).astype(np.int32) + + def generate_shapeT1_data(attrs: List[Dict[str, Any]]): + return np.array([2]).astype(np.int32) + + def generate_shapeT2_data(attrs: List[Dict[str, Any]]): + return np.array([24]).astype(np.int32) + + for dims in [1, 2, 3, 4]: + for value in [2]: + for dtype in [-1, 0, 2, 3, 4, 5]: + dics = [ + { + "value": value, + "dtype": dtype, + }, + ] + self.dims = dims + self.dtype = dtype + dics_intput = [{"X": ["fill_any_like_input"]}] + + ops_config = [ + { + "op_type": "fill_any_like", + "op_inputs": dics_intput[0], + "op_outputs": 
{"Out": ["fill_any_like_out"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "fill_any_like_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["fill_any_like_out"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], int): + def generate_dynamic_shape(attrs): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "fill_any_like_input": [1, 1, 4, 6] + } + self.dynamic_shape.max_input_shape = { + "fill_any_like_input": [10, 1, 4, 6] + } + self.dynamic_shape.opt_input_shape = { + "fill_any_like_input": [1, 1, 4, 6] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "fill_any_like_input": [1, 8, 6] + } + self.dynamic_shape.max_input_shape = { + "fill_any_like_input": [4, 8, 6] + } + self.dynamic_shape.opt_input_shape = { + "fill_any_like_input": [1, 8, 6] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = { + "fill_any_like_input": [1, 48] + } + self.dynamic_shape.max_input_shape = { + "fill_any_like_input": [4, 48] + } + self.dynamic_shape.opt_input_shape = { + "fill_any_like_input": [1, 48] + } + elif self.dims == 1: + self.dynamic_shape.min_input_shape = { + "fill_any_like_input": [48] + } + self.dynamic_shape.max_input_shape = { + "fill_any_like_input": [48] + } + self.dynamic_shape.opt_input_shape = { + "fill_any_like_input": [48] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape: + return 0, 3 + else: + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From b68e0c47eaa0458a16250f4d3d5580dd5c30abc0 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Wed, 16 Nov 2022 17:34:53 +0800 Subject: [PATCH 040/210] clean fluid elementwise_max (part2): remove API (#48034) --- .../distributed/fleet/metrics/metric.py | 2 +- python/paddle/fluid/layers/nn.py | 67 ------------------- .../unittests/ipu/test_elemetwise_x_op_ipu.py | 2 +- 3 files changed, 2 insertions(+), 69 deletions(-) diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index 34c1c2968c820..ccba440bd69bc 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -79,7 +79,7 @@ def max(input, scope=None, util=None): input = fluid.layers.cast(some_input, dtype='float32') cnt = fluid.layers.reduce_sum(input) 
global_cnt = fluid.layers.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) - tmp = fluid.layers.elementwise_max(cnt, global_cnt) + tmp = paddle.maximum(cnt, global_cnt) fluid.layers.assign(tmp, global_cnt) # in train.py, after train or infer diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 06e49b8a25f11..0e6abd666797b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -154,7 +154,6 @@ 'elementwise_div', 'elementwise_sub', 'elementwise_mul', - 'elementwise_max', 'elementwise_min', 'elementwise_pow', 'elementwise_mod', @@ -12632,71 +12631,6 @@ def gen_data(): return _elementwise_op(LayerHelper('elementwise_mul', **locals())) -def elementwise_max(x, y, axis=-1, act=None, name=None): - """ - :alias_main: paddle.elementwise_max - :alias: paddle.elementwise_max,paddle.tensor.elementwise_max,paddle.tensor.math.elementwise_max - :old_api: paddle.fluid.layers.elementwise_max - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_max(x, y) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value) #[2, 5, 4] - - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_max(x, y, axis=1) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value)#[[[[1., 1., 1., 1., 1.] .... 
[1., 1., 1., 1., 1.]]]] - - """ - if _non_static_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name='elementwise_max' - ) - - return _elementwise_op(LayerHelper('elementwise_max', **locals())) - - def elementwise_min(x, y, axis=-1, act=None, name=None): """ :alias_main: paddle.elementwise_min @@ -12874,7 +12808,6 @@ def gen_data(): elementwise_div, elementwise_sub, elementwise_mul, - elementwise_max, elementwise_pow, elementwise_min, elementwise_mod, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 8dcb3097c2f52..3ce09c9c479b8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -145,7 +145,7 @@ def set_test_op(self): class TestMax(TestMul): def set_test_op(self): - self.op = paddle.fluid.layers.elementwise_max + self.op = paddle.maximum class TestPow(TestMul): From ad8847aaab7a44736d010a867737557ddb493c73 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 16 Nov 2022 17:52:08 +0800 Subject: [PATCH 041/210] [NPU] update npu prop, test=develop (#47859) * [NPU] update npu prop, test=develop * remove ddim.h * remove diff * update storage prop, test=develop --- paddle/phi/core/dense_tensor.cc | 4 ++++ paddle/phi/core/dense_tensor.h | 4 ++++ paddle/phi/core/storage_properties.h | 9 +++++---- paddle/phi/tests/core/test_dense_tensor.cc | 14 ++++++++------ 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 844182ec3fc82..3fbf3560aff95 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -263,6 +263,10 @@ template const NPUStorageProperties& DenseTensor::storage_properties() const; template const OneDNNStorageProperties& DenseTensor::storage_properties() const; #endif +bool DenseTensor::storage_properties_initialized() const { + return storage_properties_ != nullptr; +} + void DenseTensor::set_storage_properties( std::unique_ptr&& storage_properties) { storage_properties_ = std::move(storage_properties); diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 0785af7b7037c..c5f38b762167f 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -164,6 +164,10 @@ class DenseTensor : public TensorBase, void* data(); + /// \brief Get whether the storage_properties is inited. + /// \return The init status of storage_properties. + bool storage_properties_initialized() const; + /// \brief Returns the storage_properties of the tensor. /// \return The storage_properties of the tensor. template diff --git a/paddle/phi/core/storage_properties.h b/paddle/phi/core/storage_properties.h index 908abd8d9d35d..ff41938778630 100644 --- a/paddle/phi/core/storage_properties.h +++ b/paddle/phi/core/storage_properties.h @@ -16,6 +16,7 @@ limitations under the License. 
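
The fluid.layers.elementwise_max API removed in the preceding patch maps directly onto paddle.maximum; a minimal sketch using the same values as the removed docstring example:

    import paddle

    x = paddle.to_tensor([2.0, 3.0, 4.0])
    y = paddle.to_tensor([1.0, 5.0, 2.0])
    z = paddle.maximum(x, y)  # [2., 5., 4.], matching the removed fluid example
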
*/ #include +#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/utils/type_registry.h" #ifdef PADDLE_WITH_MKLDNN @@ -42,8 +43,8 @@ struct NPUStorageProperties virtual ~NPUStorageProperties() = default; static const char* name() { return "NPUStorageProperties"; } - int64_t storage_format; - int64_t storage_layout; + int64_t storage_format{-1}; + DDim storage_dims; }; // Add OneDNNStorageProperties firstly for unittest covergae @@ -76,8 +77,8 @@ static std::unique_ptr CopyStorageProperties( auto result = std::make_unique(); result->storage_format = static_cast(sp.get())->storage_format; - result->storage_layout = - static_cast(sp.get())->storage_layout; + result->storage_dims = + static_cast(sp.get())->storage_dims; return result; #ifdef PADDLE_WITH_MKLDNN } else if (OneDNNStorageProperties::classof(sp.get())) { diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index b997d8f1e76ae..6f08eeaefffb1 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -154,13 +154,15 @@ TEST(dense_tensor, storage_properties) { EXPECT_TRUE(caught_exception); // test custom device storage properties + EXPECT_FALSE(tensor.storage_properties_initialized()); auto npu_properties = std::make_unique(); - npu_properties->storage_format = 1; - npu_properties->storage_layout = 2; + npu_properties->storage_format = 3; + npu_properties->storage_dims = {1, 1, 1, 1, 16}; tensor.set_storage_properties(std::move(npu_properties)); + EXPECT_TRUE(tensor.storage_properties_initialized()); auto get_npu_properties = tensor.storage_properties(); - CHECK_EQ(get_npu_properties.storage_format, 1); - CHECK_EQ(get_npu_properties.storage_layout, 2); + CHECK_EQ(get_npu_properties.storage_format, 3); + CHECK_EQ(get_npu_properties.storage_dims.size(), 5); // test error type storage properties #ifdef PADDLE_WITH_MKLDNN @@ -177,8 +179,8 @@ TEST(dense_tensor, storage_properties) { auto cp_tensor = tensor; auto get_cp_npu_properties = cp_tensor.storage_properties(); - CHECK_EQ(get_cp_npu_properties.storage_format, 1); - CHECK_EQ(get_cp_npu_properties.storage_layout, 2); + CHECK_EQ(get_cp_npu_properties.storage_format, 3); + CHECK_EQ(get_cp_npu_properties.storage_dims.size(), 5); } } // namespace tests From 29a0987ad19326c8318c2a07e673aa0c466442a3 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 16 Nov 2022 19:13:19 +0800 Subject: [PATCH 042/210] rm "paddle/fluid/framework/gpu_utils.h" in phi (#48020) --- paddle/phi/kernels/gpu/transpose_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index a970902d80094..36cf3fb8e397f 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -16,7 +16,6 @@ #include -#include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" From 99ec2c1671a3c3e70cff7335c1d1e77d304ae7a9 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Wed, 16 Nov 2022 19:49:20 +0800 Subject: [PATCH 043/210] [Clean fluid] Clean fluid elementwise_min (part1) (#48033) * clean fluid elementwise_min * fix elementwise_min op testcase --- .../fleet/meta_optimizers/localsgd_optimizer.py | 2 +- python/paddle/fluid/dygraph/learning_rate_scheduler.py | 4 +--- python/paddle/fluid/layers/learning_rate_scheduler.py | 
9 +++------ .../tests/unittests/ipu/test_elemetwise_x_op_ipu.py | 2 +- .../fluid/tests/unittests/test_elementwise_min_op.py | 3 ++- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++---- 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 1cd0b23488ed7..67cd428f3b969 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -465,7 +465,7 @@ def communicate_avg_loss(): min_local_steps = layers.fill_constant( shape=[1], dtype='int64', value=1 ) - next_local_steps = layers.elementwise_min( + next_local_steps = paddle.minimum( next_local_steps, max_local_steps ) next_local_steps = paddle.maximum( diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 0204542d6ec2b..21e0dc7c20c62 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -680,9 +680,7 @@ def step(self): a = self.create_lr_var(self.step_num**-0.5) b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num) lr_value = ( - self.learning_rate - * (self.d_model**-0.5) - * layers.elementwise_min(a, b) + self.learning_rate * (self.d_model**-0.5) * paddle.minimum(a, b) ) return lr_value diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 737d9e75e11cb..cf56c793511b2 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -23,6 +23,7 @@ import math import numbers +import paddle from . import control_flow from . import nn from . 
import ops @@ -109,9 +110,7 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0): a = global_step**-0.5 b = (warmup_steps**-1.5) * global_step - lr_value = ( - learning_rate * (d_model**-0.5) * nn.elementwise_min(a, b) - ) + lr_value = learning_rate * (d_model**-0.5) * paddle.minimum(a, b) return lr_value @@ -364,9 +363,7 @@ def polynomial_decay( decay_steps_var = tensor.fill_constant( shape=[1], dtype='float32', value=float(decay_steps) ) - global_step = nn.elementwise_min( - x=global_step, y=decay_steps_var - ) + global_step = paddle.minimum(x=global_step, y=decay_steps_var) decayed_lr = (learning_rate - end_learning_rate) * ( (1 - global_step / decay_steps) ** power diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 3ce09c9c479b8..0dc66ba82cde8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -140,7 +140,7 @@ def set_test_op(self): class TestMin(TestMul): def set_test_op(self): - self.op = paddle.fluid.layers.elementwise_min + self.op = paddle.minimum class TestMax(TestMul): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py index 1fe78b79fb059..8df9f9842b885 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py @@ -18,6 +18,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +from paddle import _legacy_C_ops paddle.enable_static() @@ -213,7 +214,7 @@ def get_out_and_grad(self, x_np, y_np, axis, place, use_fp32=False): y = paddle.to_tensor(y_np) x.stop_gradient = False y.stop_gradient = False - z = fluid.layers.elementwise_min(x, y, axis) + z = _legacy_C_ops.elementwise_min(x, y, 'axis', axis) x_g, y_g = paddle.grad([z], [x, y]) return ( z.numpy().astype(dtype), diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 78d01722b8121..7b7dfd399120f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -649,14 +649,12 @@ def test_elementwise_minmax(self): with self.dynamic_graph(): with _test_eager_guard(): - min_eager_ret = layers.elementwise_min( - to_variable(n), to_variable(n2) - ) + min_eager_ret = paddle.minimum(to_variable(n), to_variable(n2)) max_eager_ret = paddle.maximum(to_variable(n), to_variable(n2)) min_eager_ret_value = min_eager_ret.numpy() max_eager_ret_value = max_eager_ret.numpy() - min_ret = layers.elementwise_min(to_variable(n), to_variable(n2)) + min_ret = paddle.minimum(to_variable(n), to_variable(n2)) max_ret = paddle.maximum(to_variable(n), to_variable(n2)) min_ret_value = min_ret.numpy() max_ret_value = max_ret.numpy() From 992b30bad03448c3b2d1128fccc399f4116ac385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Wed, 16 Nov 2022 21:46:36 +0800 Subject: [PATCH 044/210] [fluid clear] Remove elu in nn.py (#47855) --- python/paddle/fluid/layers/nn.py | 44 ------------------- .../unittests/test_activation_nn_grad.py | 2 +- .../tests/unittests/test_inplace_abn_op.py | 2 +- 3 files changed, 2 insertions(+), 46 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0e6abd666797b..41766c6651aad 100644 --- a/python/paddle/fluid/layers/nn.py +++ 
b/python/paddle/fluid/layers/nn.py @@ -131,7 +131,6 @@ 'log', 'crop', 'crop_tensor', - 'elu', 'relu6', 'pow', 'stanh', @@ -9923,49 +9922,6 @@ def pad2d( return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.elu") -def elu(x, alpha=1.0, name=None): - """ - :alias_main: paddle.nn.functional.elu - :alias: paddle.nn.functional.elu,paddle.nn.functional.activation.elu - :old_api: paddle.fluid.layers.elu - - ${comment} - Args: - x(${x_type}): ${x_comment} - alpha(${alpha_type}|1.0): ${alpha_comment} - name(str|None): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - Returns: - ${out_type}: ${out_comment} - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - input_elu = np.array([[-1,6],[1,15.6]]) - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(input_elu) - y = fluid.layers.elu(x, alpha=0.2) - print(y.numpy()) - # [[-0.12642411 6. ] - # [ 1. 15.6 ]] - """ - helper = LayerHelper('elu', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='elu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': alpha}, - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.nn.functional.relu6") def relu6(x, threshold=6.0, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 38a894755f464..cee448b0648f5 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -252,7 +252,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.elu(x, alpha=alpha) + y = paddle.nn.functional.elu(x, alpha=alpha) np.random.RandomState(SEED) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index 1048c6710d27c..ff033d51efdbd 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -80,7 +80,7 @@ def build_program( if activation == 'leaky_relu': bn = fluid.layers.leaky_relu(bn, alpha) if activation == 'elu': - bn = fluid.layers.elu(bn, alpha) + bn = paddle.nn.functional.elu(bn, alpha) # NOTE: in inplace mode input and output of bn # may have same name, multiply 1. 
to generate From 7cc0d17105ac093c5bc1081205fb910fa6331e59 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Nov 2022 09:02:39 +0800 Subject: [PATCH 045/210] generate static graph code for some op (#48036) --- paddle/fluid/operators/conj_op.cc | 73 ------- paddle/fluid/operators/grid_sampler_op.cc | 190 ------------------ paddle/fluid/operators/histogram_op.cc | 75 ------- paddle/fluid/operators/index_sample_op.cc | 117 ----------- paddle/fluid/operators/index_select_op.cc | 122 ----------- paddle/fluid/operators/inverse_op.cc | 103 ---------- paddle/phi/api/yaml/backward.yaml | 50 +++++ paddle/phi/api/yaml/legacy_backward.yaml | 60 +----- paddle/phi/api/yaml/legacy_ops.yaml | 59 +----- paddle/phi/api/yaml/op_compat.yaml | 40 +++- paddle/phi/api/yaml/op_version.yaml | 8 + paddle/phi/api/yaml/ops.yaml | 57 ++++++ paddle/phi/ops/compat/grid_sampler_sig.cc | 43 ---- paddle/phi/ops/compat/histogram_sig.cc | 25 --- paddle/phi/ops/compat/index_sample_sig.cc | 28 --- paddle/phi/ops/compat/index_select_sig.cc | 28 --- paddle/phi/ops/compat/inverse_sig.cc | 26 --- .../tests/unittests/test_histogram_op.py | 2 +- 18 files changed, 161 insertions(+), 945 deletions(-) delete mode 100644 paddle/fluid/operators/conj_op.cc delete mode 100644 paddle/fluid/operators/grid_sampler_op.cc delete mode 100644 paddle/fluid/operators/histogram_op.cc delete mode 100644 paddle/fluid/operators/index_sample_op.cc delete mode 100644 paddle/fluid/operators/index_select_op.cc delete mode 100644 paddle/fluid/operators/inverse_op.cc delete mode 100644 paddle/phi/ops/compat/grid_sampler_sig.cc delete mode 100644 paddle/phi/ops/compat/histogram_sig.cc delete mode 100644 paddle/phi/ops/compat/index_sample_sig.cc delete mode 100644 paddle/phi/ops/compat/index_select_sig.cc delete mode 100644 paddle/phi/ops/compat/inverse_sig.cc diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc deleted file mode 100644 index fbf053c66582c..0000000000000 --- a/paddle/fluid/operators/conj_op.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ConjOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class ConjOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of conj op."); - AddOutput("Out", "(Tensor), The output tensor of conj op."); - AddComment(R"DOC( -Conj Operator. - -This operator is used to perform elementwise conjugate for input $X$. 
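
The conj operator whose C++ definition is deleted above stays available through the public paddle.conj API; a small illustrative example, not part of the patch:

    import paddle

    x = paddle.to_tensor([1 + 2j, 3 - 1j])
    y = paddle.conj(x)  # [(1-2j), (3+1j)]
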
- -)DOC"); - } -}; - -template -class ConjGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("conj"); - retv->SetInput("X", this->OutputGrad("Out")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput("Out", this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(conj, - ConjInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); -REGISTER_OPERATOR(conj, - ops::ConjOp, - ops::ConjOpMaker, - ops::ConjGradMaker, - ops::ConjGradMaker, - ConjInferShapeFunctor); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc deleted file mode 100644 index 7f57d6e288f87..0000000000000 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; - -class GridSampleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.GetPlace()); - } -}; - -class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor) The input data of GridSampleOp, " - "This is a 4-D tensor with shape of [N, C, H, W] or" - " a 5-D tensot with shape of [N, C, D, H, W]"); - AddInput( - "Grid", - "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, " - "This is a 4-D tensor with shape of [N, H, W, 2] is the concatenation " - "of x and y coordinates with shape [N, H, W] in last dimension or " - "a 5-D tensor with shape of [N, D, H, W, 3] is the concatenation " - "of depth, x and y coordinates with shape [N, D, H, W] in last " - "dimension "); - AddOutput("Output", - "(Tensor) Output tensor with shape [N, C, H, W] or shape [N,C, " - "D, H ,W]"); - AddAttr( - "align_corners", - "(bool, default true) If align_corners is true, it will project" - "-1 and 1 to the centers of the corner pixels. 
Otherwise, it will " - "project" - "-1 and 1 to the image edges.") - .SetDefault(true); - - AddAttr( - "mode", - "(bool, default true) The interpolation method which can be 'bilinear'" - " or 'nearest'.") - .SetDefault("bilinear"); - - AddAttr( - "padding_mode", - "(bool, default true) The padding method used when source" - "index is out of input images. It can be 'zeros', 'reflection' and " - "'border'.") - .SetDefault("zeros"); - - AddComment(R"DOC( - This operation samples input X by using bilinear or nearest interpolation based on - flow field grid, which is usually generated by affine_grid. The grid of - shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates - with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexing the 3rd - dimension (in height dimension), finally results is the bilinear - interpolation value or nearest value of 4 nearest corner points. - - For bilinear interpolation mode: - Step 1: - Get (x, y) grid coordinates and scale to [0, H-1/W-1]. - - grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) - grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) - - Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear - interpolate point value by 4 nearest points. - - wn ------- y_n ------- en - | | | - | d_n | - | | | - x_w --d_w-- grid--d_e-- x_e - | | | - | d_s | - | | | - ws ------- y_s ------- wn - - x_w = floor(x) // west side x coord - x_e = x_w + 1 // east side x coord - y_n = floor(y) // north side y coord - y_s = y_s + 1 // south side y coord - - d_w = grid_x - x_w // distance to west side - d_e = x_e - grid_x // distance to east side - d_n = grid_y - y_n // distance to north side - d_s = y_s - grid_y // distance to south side - - wn = X[:, :, y_n, x_w] // north-west point value - en = X[:, :, y_n, x_e] // north-east point value - ws = X[:, :, y_s, x_w] // south-east point value - es = X[:, :, y_s, x_w] // north-east point value - - output = wn * d_e * d_s + en * d_w * d_s - + ws * d_e * d_n + es * d_w * d_n - )DOC"); - } -}; - -class GridSampleOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.GetPlace()); - } -}; - -template -class GridSampleGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("grid_sampler_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Grid", this->Input("Grid")); - op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output")); - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Grid"), this->InputGrad("Grid")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, - GridSamplerInferShapeFunctor, - PD_INFER_META(phi::GridSampleBaseInferMeta)); -REGISTER_OPERATOR(grid_sampler, - ops::GridSampleOp, - ops::GridSampleOpMaker, - ops::GridSampleGradMaker, - ops::GridSampleGradMaker, - GridSamplerInferShapeFunctor); -DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, - GridSamplerGradInferShapeFunctor, - 
PD_INFER_META(phi::GeneralBinaryGradInferMeta)); -REGISTER_OPERATOR(grid_sampler_grad, - ops::GridSampleOpGrad, - GridSamplerGradInferShapeFunctor); - -REGISTER_OP_VERSION(grid_sampler) - .AddCheckpoint( - R"ROC( - Upgrade grid_sampler add a new attribute [mode]. - )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "mode", "In order to specify interpolation mode", "bilinear")); diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc deleted file mode 100644 index 9d58d65c83135..0000000000000 --- a/paddle/fluid/operators/histogram_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -using framework::OpKernelType; - -class HistogramOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The input tensor of Histogram op,"); - AddOutput("Out", "(Tensor) The output tensor of Histogram op,"); - AddAttr("bins", "(int) number of histogram bins") - .SetDefault(100) - .EqualGreaterThan(1); - AddAttr("min", "(int) lower end of the range (inclusive)") - .SetDefault(0); - AddAttr("max", "(int) upper end of the range (inclusive)") - .SetDefault(0); - AddComment(R"DOC( - Histogram Operator. - Computes the histogram of a tensor. The elements are sorted - into equal width bins between min and max. If min and max are - both zero, the minimum and maximum values of the data are used. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(histogram, - HistogramInferShapeFunctor, - PD_INFER_META(phi::HistogramInferMeta)); - -REGISTER_OPERATOR( - histogram, - ops::HistogramOp, - ops::HistogramOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc deleted file mode 100644 index 0c5306e1d4f4a..0000000000000 --- a/paddle/fluid/operators/index_sample_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" -namespace paddle { -namespace operators { -class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input(Tensor), dtype support int32/int64/float/double"); - AddInput("Index", "Index(Tensor), dtype support int32/int64"); - AddOutput("Out", "Return the element of input at index"); - - AddComment(R"DOC( - IndexSample OP returns the element of the specified location of X, - and the location is specified by Index. - - X tensor and Index tensor's shape must be 2-D, - dimension at 0 which usually is batch size must be equal. - - The returned tensor has the same shape and dimensions as the Index tensor. - )DOC"); - } -}; - -class IndexSampleOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class IndexSampleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), - true, - platform::errors::InvalidArgument("Input(Index) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should be not null.")); - - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -template -class IndexSampleGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("index_sample_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Index", this->Input("Index")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(index_sample, - IndexSampleInferShapeFunctor, - 
PD_INFER_META(phi::IndexSampleInferMeta)); -REGISTER_OPERATOR(index_sample, - ops::IndexSampleOp, - ops::IndexSampleOpMaker, - ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker, - IndexSampleInferShapeFunctor); -REGISTER_OPERATOR(index_sample_grad, - ops::IndexSampleGradOp, - ops::IndexSampleGradNoNeedBufferVarInferer); diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc deleted file mode 100644 index 83b0eefecf77f..0000000000000 --- a/paddle/fluid/operators/index_select_op.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/index_select_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class IndexSelectOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class IndexSelectGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), - true, - platform::errors::InvalidArgument("Input(Index) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), - true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should be not null.")); - - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); - } -}; - -class IndexSelectOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) the input tensor."); - AddInput("Index", "the 1-D tensor containing the indices to index."); - AddOutput("Out", "the output tensor."); - AddAttr("dim", "the dimension in which we index.").SetDefault(0); - AddComment(R"DOC( - Returns a new tensor which indexes the input tensor - along dimension dim using the entries in index which - is a Tensor. - - The returned tensor has the same number of dimensions - as the original tensor (input). The dim-th dimension - has the same size as the length of index; other dimensions - have the same size as in the original tensor. 
- )DOC"); - } -}; - -template -class IndexSelectGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("index_select_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("Index", this->Input("Index")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, - "X"); -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(index_select, - IndexSelectInferShapeFunctor, - PD_INFER_META(phi::IndexSelectInferMeta)); -REGISTER_OPERATOR(index_select, - ops::IndexSelectOp, - ops::IndexSelectOpMaker, - ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker, - IndexSelectInferShapeFunctor); -REGISTER_OPERATOR(index_select_grad, - ops::IndexSelectGradOp, - ops::IndexSelectGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc deleted file mode 100644 index 5d0c7c754b26c..0000000000000 --- a/paddle/fluid/operators/inverse_op.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/matrix_inverse.h" - -namespace paddle { -namespace operators { - -class InverseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class InverseOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{ - {"Input", /*->*/ "Output"}}; - return m; - } -}; - -class InverseGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class InverseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Input", - "(Tensor) A square matrix (2-D Tensor) or batches of square matrices" - " to inverse."); - AddOutput("Output", "(Tensor) The inverse of input matrix."); - AddComment(R"DOC( -Inverse Operator - -Takes the inverse of the square matrix. 
-)DOC"); - } -}; - -template -class InverseGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad) const override { - grad->SetType(this->ForwardOpType() + "_grad"); - grad->SetInput("Output", this->Output("Output")); - grad->SetInput(framework::GradVarName("Output"), - this->OutputGrad("Output")); - grad->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(inverse, - InverseInferShapeFunctor, - PD_INFER_META(phi::InverseInferMeta)); - -DECLARE_INFER_SHAPE_FUNCTOR(inverse_grad, - InverseGradInferShapeFunctor, - PD_INFER_META(phi::InverseGradInferMeta)); - -REGISTER_OPERATOR(inverse, - ops::InverseOp, - ops::InverseOpMaker, - ops::InverseOpInferVarType, - ops::InverseGradOpMaker, - ops::InverseGradOpMaker, - InverseInferShapeFunctor); - -REGISTER_OPERATOR(inverse_grad, - ops::InverseGradOp, - InverseGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 919f69525bbc2..8ab3589a3e970 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -172,6 +172,12 @@ kernel : func : cholesky_solve_grad +- backward_op : conj_grad + forward : conj (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + invoke : conj(out_grad) + - backward_op : cos_double_grad forward : cos_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) @@ -451,6 +457,17 @@ kernel : func : gelu_grad +- backward_op : grid_sample_grad + forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out) + args : (Tensor x, Tensor grid, Tensor out_grad, str mode, str padding_mode, bool align_corners) + output : Tensor(x_grad), Tensor(grid_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, grid] + kernel : + func : grid_sample_grad + data_type : x + - backward_op : gumbel_softmax_grad forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) @@ -482,6 +499,39 @@ func : hard_sigmoid_grad inplace : (out_grad -> x_grad) +- backward_op : index_sample_grad + forward : index_sample (Tensor x, Tensor index) -> Tensor(out) + args : (Tensor x, Tensor index, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : index_sample_grad + data_type : out_grad + no_need_buffer : x + +- backward_op : index_select_grad + forward : index_select(Tensor x, Tensor index, int axis) -> Tensor(out) + args : (Tensor x, Tensor index, Tensor out_grad, int axis) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : index_select_grad + data_type : out_grad + no_need_buffer : x + +- backward_op : inverse_grad + forward : inverse(Tensor x) -> Tensor(out) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad) + infer_meta: + func : InverseGradInferMeta + kernel : + func : inverse_grad + - backward_op : leaky_relu_double_grad forward : leaky_relu_grad (Tensor x, Tensor grad_out, float negative_slope) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad, float negative_slope) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 
f9ac891526c49..f920bbb8b23a7 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -253,16 +253,6 @@ no_need_buffer : x backward : concat_double_grad -- backward_op : conj_grad - forward : conj (Tensor x) -> Tensor(out) - args : (Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [out_grad] - kernel : - func : conj - - backward_op : conv2d_grad forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) @@ -629,17 +619,6 @@ func : gather_nd_grad no_need_buffer : x -- backward_op : grid_sample_grad - forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out) - args : (Tensor x, Tensor grid, Tensor out_grad, str mode, str padding_mode, bool align_corners) - output : Tensor(x_grad), Tensor(grid_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, grid] - kernel : - func : grid_sample_grad - data_type : x - - backward_op : group_norm_grad forward : group_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) -> Tensor(y), Tensor(mean), Tensor(variance) args : (Tensor x, Tensor scale, Tensor bias, Tensor y, Tensor mean, Tensor variance, Tensor y_grad, float epsilon, int groups, str data_layout) @@ -702,6 +681,12 @@ output : Tensor(x_grad) invoke : imag_grad_impl(out_grad, x_grad) +- backward_op : increment_grad + forward : increment (Tensor x, float value) -> Tensor(out) + args : (Tensor out, float value) + output : Tensor(x_grad) + invoke : increment (out, -value) + - backward_op : index_add_grad forward : index_add(Tensor x, Tensor index, Tensor add_value, int axis) -> Tensor(out) args : (Tensor index, Tensor add_value, Tensor out_grad, int axis) @@ -713,30 +698,6 @@ data_type : out_grad inplace : (out_grad -> x_grad) -- backward_op : index_sample_grad - forward : index_sample (Tensor x, Tensor index) -> Tensor(out) - args : (Tensor x, Tensor index, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : index_sample_grad - data_type : out_grad - no_need_buffer : x - -- backward_op : index_select_grad - forward : index_select(Tensor x, Tensor index, int axis) -> Tensor(out) - args : (Tensor x, Tensor index, Tensor out_grad, int axis) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : index_select_grad - data_type : x - no_need_buffer : x - - backward_op : instance_norm_double_grad forward : instance_norm_grad(Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, float epsilon) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) args : (Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float epsilon) @@ -760,15 +721,6 @@ optional : scale backward : instance_norm_double_grad -- backward_op : inverse_grad - forward : inverse(Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad) - output : Tensor(x_grad) - infer_meta: - func : InverseGradInferMeta - kernel : - func : inverse_grad - - backward_op : kldiv_loss_grad forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out) args : 
(Tensor x, Tensor label, Tensor out_grad, str reduction) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index dca7d885be7fe..a1bc49a477ac3 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -424,15 +424,6 @@ func : concat backward : concat_grad -- op : conj - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : conj - backward : conj_grad - - op : conv2d args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int[] dilations, int groups, str data_format) output : Tensor @@ -911,17 +902,6 @@ kernel : func : greater_than -- op : grid_sample - args : (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) - output : Tensor(out) - infer_meta : - func : GridSampleBaseInferMeta - param : [x, grid] - kernel: - func : grid_sample - data_type : x - backward : grid_sample_grad - - op : group_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) output : Tensor(y), Tensor(mean), Tensor(variance) @@ -953,14 +933,6 @@ func : hard_tanh backward : hardtanh_grad -- op : histogram - args : (Tensor input, int64_t bins, int min, int max) - output : Tensor(out) - infer_meta : - func : HistogramInferMeta - kernel : - func : histogram - - op : hsigmoid_loss args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) output : Tensor(out), Tensor(pre_out), Tensor(w_out) @@ -991,7 +963,7 @@ backward : imag_grad - op : increment - args : (Tensor x, float value) + args : (Tensor x, float value = 1.0) output : Tensor(out) infer_meta : func : IncrementInferMeta @@ -1010,26 +982,6 @@ inplace : (x -> out) backward : index_add_grad -- op : index_sample - args : (Tensor x, Tensor index) - output : Tensor - infer_meta : - func : IndexSampleInferMeta - kernel : - func : index_sample - data_type : x - backward : index_sample_grad - -- op : index_select - args : (Tensor x, Tensor index, int axis) - output : Tensor(out) - infer_meta : - func : IndexSelectInferMeta - kernel : - func : index_select - data_type : x - backward : index_select_grad - - op : instance_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon) output : Tensor(y), Tensor(saved_mean), Tensor(saved_variance) @@ -1042,15 +994,6 @@ intermediate : saved_mean, saved_variance backward : instance_norm_grad -- op : inverse - args : (Tensor x) - output : Tensor(out) - infer_meta : - func : InverseInferMeta - kernel : - func : inverse - backward : inverse_grad - - op : is_empty args : (Tensor x) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index a0894a9aca8f6..8c1d7ac308576 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -188,6 +188,12 @@ extra : attrs : ['str[] skip_eager_deletion_vars = {}'] +- op : conj + inputs : + x : X + outputs : + out : Out + - op : conv2d backward : conv2d_grad extra : @@ -546,8 +552,12 @@ attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] -- op : grid_sampler - backward : grid_sampler_grad +- op : grid_sample(grid_sampler) + backward : grid_sample_grad (grid_sampler_grad) + inputs : + {x : X, grid : Grid} + 
outputs : + out : Output extra : attrs : [bool use_cudnn = true] @@ -587,11 +597,37 @@ attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] +- op : histogram + inputs : + input : X + outputs : + out : Out + +- op : index_sample + inputs : + {x : X, index : Index} + outputs : + out : Out + +- op : index_select + inputs : + {x : X, index : Index} + outputs : + out : Out + attrs : + axis : dim + - op : inplace_abn backward : inplace_abn_grad extra : attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] +- op : inverse + inputs : + x : Input + outputs : + out : Output + - op : layer_norm backward : layer_norm_grad extra : diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index 3028b927966a2..ba669f4ba66a9 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -8,6 +8,14 @@ - delete_attr : dims comment : The attr 'dims' is deleted. +- op : grid_sample + version : + - checkpoint : Upgrade grid_sampler add a new attribute [mode] + action : + - add_attr : mode + comment : In order to specify interpolation mode + default : std::string("bilinear") + - op : trace version : - checkpoint : Upgrade trace add a new attribute [axis2] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4645be6b4c339..88ab2ee099ca6 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -152,6 +152,15 @@ func : cholesky_solve backward : cholesky_solve_grad +- op : conj + args : (Tensor x) + output : Tensor (out) + infer_meta : + func : UnchangedInferMeta + kernel : + func : conj + backward : conj_grad + - op : cos args : (Tensor x) output : Tensor @@ -411,6 +420,17 @@ func : gelu backward : gelu_grad +- op : grid_sample + args : (Tensor x, Tensor grid, str mode = "bilinear", str padding_mode = "zeros", bool align_corners = true) + output : Tensor(out) + infer_meta : + func : GridSampleBaseInferMeta + param : [x, grid] + kernel: + func : grid_sample + data_type : x + backward : grid_sample_grad + - op : gumbel_softmax args : (Tensor x, float temperature = 1.0, bool hard = false, int axis = -1) output : Tensor @@ -440,6 +460,43 @@ func : hard_sigmoid backward : hardsigmoid_grad +- op : histogram + args : (Tensor input, int64_t bins = 100, int min = 0, int max = 0) + output : Tensor(out) + infer_meta : + func : HistogramInferMeta + kernel : + func : histogram + +- op : index_sample + args : (Tensor x, Tensor index) + output : Tensor + infer_meta : + func : IndexSampleInferMeta + kernel : + func : index_sample + data_type : x + backward : index_sample_grad + +- op : index_select + args : (Tensor x, Tensor index, int axis = 0) + output : Tensor(out) + infer_meta : + func : IndexSelectInferMeta + kernel : + func : index_select + data_type : x + backward : index_select_grad + +- op : inverse + args : (Tensor x) + output : Tensor(out) + infer_meta : + func : InverseInferMeta + kernel : + func : inverse + backward : inverse_grad + - op : leaky_relu args : (Tensor x, float negative_slope = 0.02f) output : Tensor diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc deleted file mode 100644 index 486d5230ee7a6..0000000000000 --- a/paddle/phi/ops/compat/grid_sampler_sig.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature GridSamplerOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("grid_sample", - {"X", "Grid"}, - {"mode", "padding_mode", "align_corners"}, - {"Output"}); -} - -KernelSignature GridSamplerGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("grid_sample_grad", - {"X", "Grid", "Output@GRAD"}, - {"mode", "padding_mode", "align_corners"}, - {"X@GRAD", "Grid@GRAD"}); -} - -} // namespace phi - -// use Python API name as kernel name -PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); -PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); - -PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, - phi::GridSamplerGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/histogram_sig.cc b/paddle/phi/ops/compat/histogram_sig.cc deleted file mode 100644 index 0cea146ea4e7f..0000000000000 --- a/paddle/phi/ops/compat/histogram_sig.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("histogram", {"X"}, {"bins", "min", "max"}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc deleted file mode 100644 index 9c1b7e27f04ec..0000000000000 --- a/paddle/phi/ops/compat/index_sample_sig.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature IndexSampleGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "index_sample_grad", {"X", "Index", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, - phi::IndexSampleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/index_select_sig.cc b/paddle/phi/ops/compat/index_select_sig.cc deleted file mode 100644 index 096ad2332c9ab..0000000000000 --- a/paddle/phi/ops/compat/index_select_sig.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature IndexSelectGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "index_select_grad", {"X", "Index", "Out@GRAD"}, {"dim"}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(index_select_grad, - phi::IndexSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/inverse_sig.cc b/paddle/phi/ops/compat/inverse_sig.cc deleted file mode 100644 index 9ec56d5759a96..0000000000000 --- a/paddle/phi/ops/compat/inverse_sig.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { -KernelSignature InverseGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "inverse_grad", {"Output", "Output@GRAD"}, {}, {"Input@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(inverse_grad, phi::InverseGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index 516bc53a34aca..669431f8f6ff4 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -88,7 +88,7 @@ def net_func(): ) paddle.histogram(input=input_value, bins=-1, min=1, max=5) - with self.assertRaises(IndexError): + with self.assertRaises(ValueError): self.run_network(net_func) def test_min_max_error(self): From efdf75e3a1ab17df3fa57bffb15ce59f97d91db9 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 17 Nov 2022 10:32:54 +0800 Subject: [PATCH 046/210] xpu.cmake: use baidu-kunlun-product. update to 1116. (#48031) --- cmake/external/xpu.cmake | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 5e560a27018b2..a616387c0905e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,20 +10,11 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221110") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221116") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() -# ubuntu and centos: use output by XDNN API team -if(NOT DEFINED XPU_XDNN_BASE_URL) - set(XPU_XDNN_BASE_URL_WITHOUT_DATE - "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20221109") -else() - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") -endif() - set(XPU_XCCL_BASE_URL "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.0") @@ -31,53 +22,34 @@ if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") - set(XPU_XDNN_URL - "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) elseif(WITH_SUNWAY) set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") - set(XPU_XDNN_URL - "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) elseif(WITH_BDCENTOS) set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") - # ubuntu and centos: use output by XDNN API team - set(XPU_XDNN_URL - "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) elseif(WITH_UBUNTU) set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") - # ubuntu and centos: use output by XDNN API team - set(XPU_XDNN_URL - "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) elseif(WITH_CENTOS) set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") - # ubuntu and centos: use output by XDNN API team - 
set(XPU_XDNN_URL - "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) else() set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64") - # default: use output by XDNN API team - set(XPU_XDNN_URL - "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" - CACHE STRING "" FORCE) endif() set(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +set(XPU_XDNN_URL + "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" + CACHE STRING "" FORCE) set(XPU_XCCL_URL "${XPU_XCCL_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) From 33d81aa4077a656949822a8fe45ef3dd8ebbfe2c Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 17 Nov 2022 10:50:25 +0800 Subject: [PATCH 047/210] [CodeStyle][F821] add a missing import (#48006) --- python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index 5ea3845c0bd18..2317e38cb28d0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -18,6 +18,7 @@ import sys import paddle +import paddle.distributed as dist from paddle.fluid.framework import _test_eager_guard from paddle.fluid.dygraph.parallel import ParallelEnv From ccbd03d5fb0473f4a35955ebf5ea4d8656551e12 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 17 Nov 2022 11:06:26 +0800 Subject: [PATCH 048/210] Add vectorized bfloat16 atomicAdd (#48056) * add vectorized bfloat16 atomicAdd * fix compile error * fix compile error again * fix V100 compile error * fix V100 compile again --- .../platform/device/gpu/gpu_primitives.h | 96 ++++++++++------- paddle/phi/backends/gpu/gpu_primitives.h | 101 ++++++++++-------- .../unittests/test_bfloat16_embedding.py | 79 ++++++++++++++ 3 files changed, 192 insertions(+), 84 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_bfloat16_embedding.py diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 96eddf09237d9..4df203b48bb9a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -151,47 +151,68 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } #endif +template +struct VecAtomicAddHelperBase { + static constexpr auto kIsAvailable = IsAvailable; + using NVT = NVType; + using NVVec2T = NVVec2Type; +}; + +template +struct VecAtomicAddHelper : VecAtomicAddHelperBase {}; + +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +template <> +struct VecAtomicAddHelper + : VecAtomicAddHelperBase {}; +#endif + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +template <> +struct VecAtomicAddHelper + : VecAtomicAddHelperBase {}; +#endif + // The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" // is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. 
template ::value>::type * = nullptr> + typename std::enable_if::kIsAvailable>::type * = + nullptr> __device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index, const size_t numel, T value) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - CudaAtomicAdd(reinterpret_cast(tensor) + index, - static_cast(value)); -#else // whether the address is 32-byte aligned. - __half *target_addr = reinterpret_cast<__half *>(tensor + index); + using NVT = typename VecAtomicAddHelper::NVT; + using NVVec2T = typename VecAtomicAddHelper::NVVec2T; + NVT *target_addr = reinterpret_cast(tensor + index); bool aligned_half2 = - (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + (reinterpret_cast(target_addr) % sizeof(NVVec2T) == 0); if (aligned_half2 && index < (numel - 1)) { - __half2 value2; - value2.x = *reinterpret_cast<__half *>(&value); - value2.y = __int2half_rz(0); - atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + NVVec2T value2; + value2.x = *reinterpret_cast(&value); + value2.y = 0.0; + atomicAdd(reinterpret_cast(target_addr), value2); } else if (!aligned_half2 && index > 0) { - __half2 value2; - value2.x = __int2half_rz(0); - value2.y = *reinterpret_cast<__half *>(&value); - atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + NVVec2T value2; + value2.x = 0.0; + value2.y = *reinterpret_cast(&value); + atomicAdd(reinterpret_cast(target_addr - 1), value2); } else { - atomicAdd(reinterpret_cast<__half *>(tensor) + index, - *reinterpret_cast<__half *>(&value)); + atomicAdd(reinterpret_cast(tensor) + index, + *reinterpret_cast(&value)); } -#endif } template ::value>::type * = nullptr> + typename std::enable_if::kIsAvailable>::type + * = nullptr> __device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index, const size_t numel, @@ -546,16 +567,16 @@ CUDA_ATOMIC_WRAPPER(Min, float16) { } #endif -#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_WITH_CUDA /* * One thead block deals with elementwise atomicAdd for vector of len. * @in: [x1, x2, x3, ...] * @out:[y1+x1, y2+x2, y3+x3, ...] * */ + template ::value>::type * = nullptr> + typename std::enable_if::kIsAvailable>::type + * = nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { for (int i = tid; i < len; i += threads_per_block) { @@ -565,30 +586,26 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( // Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
template ::value>::type * = nullptr> + typename std::enable_if::kIsAvailable>::type * = + nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -#else int i = 0; int loops = len / 2 * 2; + using NVT = typename VecAtomicAddHelper::NVT; + using NVVec2T = typename VecAtomicAddHelper::NVVec2T; bool aligned_half2 = - (reinterpret_cast(out) % sizeof(__half2) == 0); + (reinterpret_cast(out) % sizeof(NVT) == 0); if (aligned_half2) { for (i = tid * 2; i < loops; i += threads_per_block * 2) { - __half2 value2; + NVVec2T value2; T value_1 = in[i]; T value_2 = in[i + 1]; - value2.x = *reinterpret_cast<__half *>(&value_1); - value2.y = *reinterpret_cast<__half *>(&value_2); - atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + value2.x = *reinterpret_cast(&value_1); + value2.y = *reinterpret_cast(&value_2); + atomicAdd(reinterpret_cast(&out[i]), value2); } for (; i < len; i += threads_per_block) { fastAtomicAdd(out, i, len, in[i]); @@ -598,9 +615,8 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( fastAtomicAdd(out, i, len, in[i]); } } -#endif } -#endif + #endif } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index be08f29aa8150..12f58257cf044 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -156,47 +156,65 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } #endif +template +struct VecAtomicAddHelperBase { + static constexpr auto kIsAvailable = IsAvailable; + using NVT = NVType; + using NVVec2T = NVVec2Type; +}; + +template +struct VecAtomicAddHelper : VecAtomicAddHelperBase {}; + +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +template <> +struct VecAtomicAddHelper + : VecAtomicAddHelperBase {}; +#endif + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +template <> +struct VecAtomicAddHelper + : VecAtomicAddHelperBase {}; +#endif + // The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" // is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. -template < - typename T, - typename std::enable_if::value>::type * = nullptr> +template ::kIsAvailable>::type * = + nullptr> __device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index, const size_t numel, T value) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - CudaAtomicAdd(reinterpret_cast(tensor) + index, - static_cast(value)); -#else // whether the address is 32-byte aligned. 
- __half *target_addr = reinterpret_cast<__half *>(tensor + index); + using NVT = typename VecAtomicAddHelper::NVT; + using NVVec2T = typename VecAtomicAddHelper::NVVec2T; + NVT *target_addr = reinterpret_cast(tensor + index); bool aligned_half2 = - (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + (reinterpret_cast(target_addr) % sizeof(NVVec2T) == 0); if (aligned_half2 && index < (numel - 1)) { - __half2 value2; - value2.x = *reinterpret_cast<__half *>(&value); - value2.y = __int2half_rz(0); - atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + NVVec2T value2; + value2.x = *reinterpret_cast(&value); + value2.y = 0.0; + atomicAdd(reinterpret_cast(target_addr), value2); } else if (!aligned_half2 && index > 0) { - __half2 value2; - value2.x = __int2half_rz(0); - value2.y = *reinterpret_cast<__half *>(&value); - atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + NVVec2T value2; + value2.x = 0.0; + value2.y = *reinterpret_cast(&value); + atomicAdd(reinterpret_cast(target_addr - 1), value2); } else { - atomicAdd(reinterpret_cast<__half *>(tensor) + index, - *reinterpret_cast<__half *>(&value)); + atomicAdd(reinterpret_cast(tensor) + index, + *reinterpret_cast(&value)); } -#endif } -template < - typename T, - typename std::enable_if::value>::type * = nullptr> +template ::kIsAvailable>::type + * = nullptr> __device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index, const size_t numel, @@ -551,16 +569,16 @@ CUDA_ATOMIC_WRAPPER(Min, float16) { } #endif -#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_WITH_CUDA /* * One thead block deals with elementwise atomicAdd for vector of len. * @in: [x1, x2, x3, ...] * @out:[y1+x1, y2+x2, y3+x3, ...] * */ -template < - typename T, - typename std::enable_if::value>::type * = nullptr> + +template ::kIsAvailable>::type + * = nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { for (int i = tid; i < len; i += threads_per_block) { @@ -569,31 +587,27 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( } // Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
-template < - typename T, - typename std::enable_if::value>::type * = nullptr> +template ::kIsAvailable>::type * = + nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -#else int i = 0; int loops = len / 2 * 2; + using NVT = typename VecAtomicAddHelper::NVT; + using NVVec2T = typename VecAtomicAddHelper::NVVec2T; bool aligned_half2 = - (reinterpret_cast(out) % sizeof(__half2) == 0); + (reinterpret_cast(out) % sizeof(NVT) == 0); if (aligned_half2) { for (i = tid * 2; i < loops; i += threads_per_block * 2) { - __half2 value2; + NVVec2T value2; T value_1 = in[i]; T value_2 = in[i + 1]; - value2.x = *reinterpret_cast<__half *>(&value_1); - value2.y = *reinterpret_cast<__half *>(&value_2); - atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + value2.x = *reinterpret_cast(&value_1); + value2.y = *reinterpret_cast(&value_2); + atomicAdd(reinterpret_cast(&out[i]), value2); } for (; i < len; i += threads_per_block) { fastAtomicAdd(out, i, len, in[i]); @@ -603,8 +617,7 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( fastAtomicAdd(out, i, len, in[i]); } } -#endif } -#endif + #endif } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_bfloat16_embedding.py b/python/paddle/fluid/tests/unittests/test_bfloat16_embedding.py new file mode 100644 index 0000000000000..e86c45cf5412b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bfloat16_embedding.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import numpy as np +import unittest +import paddle.nn.functional as F +from test_sparse_attention_op import get_cuda_version + + +class BF16EmbeddingTest(unittest.TestCase): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 1024 + self.hidden_size = 512 + self.seed = 10 + + def run_main(self, dtype): + ids, weight, dout = self.gen_random() + origin_dtype = weight.dtype + weight_cast = weight.astype(dtype) + out = F.embedding(ids, weight_cast) + dout = dout.astype(out.dtype) + dweight = paddle.autograd.grad(out, weight, dout) + return ( + out.astype(origin_dtype).numpy(), + dweight[0].astype(origin_dtype).numpy(), + ) + + def gen_random(self): + np.random.seed(self.seed) + weight = np.random.random([self.vocab_size, self.hidden_size]).astype( + 'float32' + ) + ids = np.random.randint( + low=0, high=self.vocab_size, size=[self.batch_size] + ) + dout = np.random.random([self.batch_size, self.hidden_size]).astype( + 'float32' + ) + + weight = paddle.to_tensor(weight) + weight.stop_gradient = False + ids = paddle.to_tensor(ids) + dout = paddle.to_tensor(dout) + return ids, weight, dout + + def test_main(self): + if not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000: + return + + ret1 = self.run_main('float32') + ret2 = self.run_main('bfloat16') + self.assertEqual(len(ret1), len(ret2)) + for i, (r1, r2) in enumerate(zip(ret1, ret2)): + np.testing.assert_allclose(r1, r2, atol=1e-3, rtol=1e-2) + + +class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 511 + self.hidden_size = 512 + self.seed = 20 + + +if __name__ == "__main__": + unittest.main() From 4f57da5fa6866a81f47ba90a8c9573648bdff11d Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Thu, 17 Nov 2022 11:19:58 +0800 Subject: [PATCH 049/210] [Zero-Dim] temporarily revert create_scalar due to input 0D is not fully supported (#48058) --- python/paddle/fluid/layers/math_op_patch.py | 3 ++- python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index fb3979434347f..f9ba649867161 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -99,7 +99,8 @@ def create_tensor(block, value, dtype, shape): return var def create_scalar(block, value, dtype): - return create_tensor(block, value, dtype, shape=[]) + # TODO(zhouwei): will change to [] which is 0-D Tensor + return create_tensor(block, value, dtype, shape=[1]) def create_tensor_with_batchsize(ref_var, value, dtype): assert isinstance(ref_var, Variable) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index c85f5aec42e9f..174172b026f21 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -350,7 +350,7 @@ def test_dygraph_binary(self): paddle.enable_static() - def test_static_unary(self): + def test_static_binary(self): paddle.enable_static() for api in binary_api_list + binary_api_list_without_grad: main_prog = fluid.Program() @@ -377,15 +377,19 @@ def test_static_unary(self): # Test runtime shape self.assertEqual(out_np.shape, ()) + # TODO(zhouwei): will open when create_scalar is [] # 2) x is 0D , y is scalar + ''' x = paddle.rand([]) y = 0.5 x.stop_gradient = False + print(api) if 
isinstance(api, dict): out = getattr(paddle.static.Variable, api['cls_method'])( x, y ) self.assertEqual(out.shape, ()) + ''' for api in binary_int_api_list_without_grad: main_prog = fluid.Program() From e5ed5257083b92b018330812c33c746bae26fb41 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 17 Nov 2022 11:22:47 +0800 Subject: [PATCH 050/210] Support bfloat16 for adamw and adam optimizer. Fit the lr for pure bf16 training with tensor fusion. (#48041) * add bfloat16 for adamw * set lr not to bfloat16 for pure bf16 training * update the logic * update the adamw optimizer * support bfloat for adam --- paddle/fluid/pybind/eager_functions.cc | 3 ++- paddle/phi/kernels/gpu/adamw_kernel.cu | 4 ++- python/paddle/optimizer/adam.py | 34 ++++++++++++-------------- python/paddle/optimizer/adamw.py | 17 ++++++------- python/paddle/optimizer/optimizer.py | 26 +++++++++++++++++--- 5 files changed, 51 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index cdace567b2e9d..3389daf330c7c 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -268,7 +268,8 @@ PyObject* eager_api_get_grads_types(PyObject* self, if (meta && grad.initialized()) { if (grad.is_dense_tensor() && (tensor.dtype() == paddle::experimental::DataType::FLOAT32 || - tensor.dtype() == paddle::experimental::DataType::FLOAT16)) { + tensor.dtype() == paddle::experimental::DataType::FLOAT16 || + tensor.dtype() == paddle::experimental::DataType::BFLOAT16)) { ret.emplace_back( paddle::framework::TransToProtoVarType(tensor.dtype())); } diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 9ddaacdd5cc6b..6994c83f53624 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -300,7 +301,8 @@ PD_REGISTER_KERNEL(adamw, phi::AdamwDenseKernel, float, double, - phi::dtype::float16) { + phi::dtype::float16, + phi::dtype::bfloat16) { // Skip beta1_pow, beta2_pow, skip_update data transform kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 74499b05f24ae..aa76fb82759f1 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -28,7 +28,7 @@ __all__ = [] -GRAD_TYPES = [int(paddle.float32), int(paddle.float16)] +GRAD_TYPES = [int(paddle.float32), int(paddle.float16), int(paddle.bfloat16)] class Adam(Optimizer): @@ -265,8 +265,8 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = ( - self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param.dtype ) target_param = ( self._master_weights[param.name] if find_master else param @@ -285,10 +285,7 @@ def _get_accumulator(self, name, param): def _add_moments_pows(self, p): acc_dtype = p.dtype - if ( - acc_dtype == core.VarDesc.VarType.FP16 - or acc_dtype == core.VarDesc.VarType.BF16 - ): + if self._is_dtype_fp16_or_bf16(acc_dtype): acc_dtype = 
core.VarDesc.VarType.FP32 self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) @@ -322,16 +319,16 @@ def _create_accumulators(self, block, parameters): # Create accumulator tensors for first and second moments for p in parameters: - if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): master_p = self._create_master_weight(p) self._add_moments_pows(master_p) continue if ( - p.dtype == core.VarDesc.VarType.FP16 + self._is_dtype_fp16_or_bf16(p.dtype) and not self._multi_precision ): warnings.warn( - "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Accumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Adam optimizer." ) self._add_moments_pows(p) @@ -353,9 +350,8 @@ def _append_optimize_op(self, block, param_and_grad): beta2_pow_acc = self._get_accumulator( self._beta2_pow_acc_str, param_and_grad[0] ) - find_master = ( - self._multi_precision - and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param_and_grad[0].dtype ) master_weight = ( self._master_weights[param_and_grad[0].name] @@ -571,7 +567,7 @@ def step(self): def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ - All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). + All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (bfloat16, float16, float32). This function will be overridden in the corresponding optimizer file. Args: target_block: the block in which the loss tensor is present @@ -604,7 +600,7 @@ def _multi_tensor_init(self, target_block, parameters, param_group_idx): self._beta2_pow_acc_dict['FP32_LODTensor'][ param_group_idx ].append(beta2_pow_acc) - elif param.dtype == paddle.float16: + elif self._is_dtype_fp16_or_bf16(param.dtype): self._param_dict['FP16_LODTensor'][param_group_idx].append( param ) @@ -628,7 +624,7 @@ def _multi_tensor_init(self, target_block, parameters, param_group_idx): self._master_weight_dict['FP16_LODTensor'] = None else: raise ValueError( - "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." + "Now multi_tensor_momentum only support fp32, fp16 or bf16 parameters and grad is LOD_TENSOR." 
) def _append_optimize_multi_tensor_op( @@ -656,7 +652,7 @@ def _append_optimize_multi_tensor_op( ) lr = self._create_param_lr(parameters_and_grads[index]) lr_dict['FP32_LODTensor'].append(lr) - elif tp == GRAD_TYPES[1]: + elif tp == GRAD_TYPES[1] or tp == GRAD_TYPES[2]: grad_dict['FP16_LODTensor'].append( parameters_and_grads[index][1] ) @@ -678,7 +674,7 @@ def _append_optimize_multi_tensor_op( lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) elif ( - param_and_grad[0].dtype == paddle.float16 + self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype) and param_and_grad[1].type == core.VarDesc.VarType.LOD_TENSOR ): @@ -711,7 +707,7 @@ def _append_optimize_multi_tensor_op( lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) elif ( - param_and_grad[0].dtype == paddle.float16 + self._is_dtype_fp16_or_bf16(param_and_grad[0].dtype) and param_and_grad[1].type == core.VarDesc.VarType.LOD_TENSOR ): diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index dca844b668275..5424331a71fa9 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -369,8 +369,8 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = ( - self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param.dtype ) target_param = ( self._master_weights[param.name] if find_master else param @@ -389,7 +389,7 @@ def _get_accumulator(self, name, param): def _add_moments_pows(self, p): acc_dtype = p.dtype - if acc_dtype == core.VarDesc.VarType.FP16: + if self._is_dtype_fp16_or_bf16(acc_dtype): acc_dtype = core.VarDesc.VarType.FP32 self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) @@ -423,16 +423,16 @@ def _create_accumulators(self, block, parameters): # Create accumulator tensors for first and second moments for p in parameters: - if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): master_p = self._create_master_weight(p) self._add_moments_pows(master_p) continue if ( - p.dtype == core.VarDesc.VarType.FP16 + self._is_dtype_fp16_or_bf16(p.dtype) and not self._multi_precision ): warnings.warn( - "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Accumulating with FP16 or BF16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Adam optimizer." 
) self._add_moments_pows(p) @@ -463,9 +463,8 @@ def _append_optimize_op(self, block, param_and_grad): beta2_pow_acc = self._get_accumulator( self._beta2_pow_acc_str, param_and_grad[0] ) - find_master = ( - self._multi_precision - and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param_and_grad[0].dtype ) master_weight = ( self._master_weights[param_and_grad[0].name] diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 26ae5b50269b2..59663bb819088 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -421,15 +421,21 @@ def get_opti_var_name_list(self): return self._opti_name_list def _create_global_learning_rate(self): - # lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr + # lr var can't be float16 or bfloat16, for pure fp16 or bf16 training, should extra handle the dtype for lr _lr_dtype = ( paddle.get_default_dtype() if self._dtype is None else self._dtype ) _lr_dtype = ( paddle.float32 if ( - paddle.get_default_dtype() != "float16" - and _lr_dtype == paddle.float16 + ( + paddle.get_default_dtype() != "float16" + and _lr_dtype == paddle.float16 + ) + or ( + paddle.get_default_dtype() != "bfloat16" + and _lr_dtype == paddle.bfloat16 + ) ) else _lr_dtype ) @@ -1526,3 +1532,17 @@ def _append_optimize_multi_tensor_op( For Multi Tensor, append optimize merged_operator to block. """ pass + + def _is_dtype_fp16_or_bf16(self, dtype): + """ + check the dtype is fp16 or the dtype is bf16 + :param dtype: instance of core.VarDesc.VarType + :return: True if dtype is one of fp16 or bf16, False otherwise + """ + assert isinstance( + dtype, core.VarDesc.VarType + ), "The dtype should be an instance of core.VarDesc.VarType." + return ( + dtype == core.VarDesc.VarType.FP16 + or dtype == core.VarDesc.VarType.BF16 + ) From f62bd3b490b151fce074d1cd11389161b1b0acbd Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 17 Nov 2022 11:29:36 +0800 Subject: [PATCH 051/210] [PHI decoupling] move "paddle/fluid/operators/math.h" to phi (#48062) * rm "paddle/fluid/operators/math.h" in phi * rm "paddle/fluid/operators/math.h" in fluit --- paddle/fluid/operators/cross_entropy_op.h | 4 +-- paddle/fluid/operators/dequantize_log_op.cu | 1 - .../detection/sigmoid_focal_loss_op.cu | 32 +++++++++++-------- paddle/fluid/operators/math/cross_entropy.cu | 13 ++++---- .../sequence_ops/sequence_softmax_op.cu | 6 ++-- paddle/phi/kernels/cpu/bce_loss_kernel.cc | 7 ++-- .../phi/kernels/cpu/nll_loss_grad_kernel.cc | 2 +- paddle/phi/kernels/funcs/functors.h | 8 ++--- .../operators => phi/kernels/funcs}/math.h | 20 ++++++------ paddle/phi/kernels/gpu/nll_loss.h | 2 +- .../gpu/sigmoid_cross_entropy_with_logits.h | 2 +- ...d_cross_entropy_with_logits_grad_kernel.cu | 4 +-- ...igmoid_cross_entropy_with_logits_kernel.cu | 5 ++- paddle/phi/kernels/impl/selu_kernel_impl.h | 4 +-- 14 files changed, 55 insertions(+), 55 deletions(-) rename paddle/{fluid/operators => phi/kernels/funcs}/math.h (69%) diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 2949dc8d1fb2a..4dcaf7b99f091 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -15,9 +15,9 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/math.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -190,7 +190,7 @@ struct HardLabelCrossEntropyForwardFunctor { label); auto match_x = x_[idx * feature_size_ + label]; - y_[idx] = -math::TolerableValue()(real_log(match_x)); + y_[idx] = -math::TolerableValue()(phi::funcs::real_log(match_x)); match_x_[idx] = match_x; } else { y_[idx] = 0; diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 360871f9e7251..4a1976f6fdd68 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index bad93fd22b2e9..76a47581e9f72 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math.h" namespace paddle { namespace operators { @@ -55,15 +55,16 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data, T s_pos = alpha / fg_num; // p = 1. / 1. + expf(-x) - T p = 1. / (1. + real_exp(-x)); + T p = 1. / (1. + phi::funcs::real_exp(-x)); // (1 - p)**gamma * log(p) T term_pos = std::pow(static_cast(1. - p), gamma) * - real_log(p > FLT_MIN ? p : FLT_MIN); + phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN); // p**gamma * log(1 - p) - T term_neg = - std::pow(p, gamma) * - (-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))); + T term_neg = std::pow(p, gamma) * + (-1. * x * (x >= 0) - + phi::funcs::real_log( + 1. + phi::funcs::real_exp(x - 2. * x * (x >= 0)))); out_data[i] = 0.0; out_data[i] += -c_pos * term_pos * s_pos; @@ -96,17 +97,20 @@ __global__ void GPUSigmoidFocalLossBackward(const T *x_data, T c_pos = static_cast(g == (d + 1)); T c_neg = static_cast((g != -1) & (g != (d + 1))); - T p = 1. / (1. + real_exp(-x)); + T p = 1. / (1. + phi::funcs::real_exp(-x)); // (1-p)**g * (1 - p - g*p*log(p)) - T term_pos = std::pow(static_cast(1. - p), gamma) * - (1. - p - (p * gamma * real_log(p > FLT_MIN ? p : FLT_MIN))); + T term_pos = + std::pow(static_cast(1. - p), gamma) * + (1. - p - + (p * gamma * phi::funcs::real_log(p > FLT_MIN ? p : FLT_MIN))); // (p**g) * (g*(1-p)*log(1-p) - p) - T term_neg = - std::pow(p, gamma) * - ((-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))) * - (1. - p) * gamma - - p); + T term_neg = std::pow(p, gamma) * + ((-1. * x * (x >= 0) - + phi::funcs::real_log( + 1. + phi::funcs::real_exp(x - 2. 
* x * (x >= 0)))) * + (1. - p) * gamma - + p); dx_data[i] = 0.0; dx_data[i] += -c_pos * s_pos * term_pos; diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0e5b95542455e..478c4e0cd6611 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/math.h" namespace paddle { namespace operators { @@ -39,9 +39,10 @@ __global__ void CrossEntropyKernel(T* Y, D, ignore_index, lbl); - Y[i] = ignore_index == lbl - ? static_cast(0) - : -math::TolerableValue()(real_log(X[i * D + lbl])); + Y[i] = + ignore_index == lbl + ? static_cast(0) + : -math::TolerableValue()(phi::funcs::real_log(X[i * D + lbl])); } } @@ -56,7 +57,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, int idx = blockIdx.x * class_num + tid; int end = blockIdx.x * class_num + class_num; for (; idx < end; idx += blockDim.x) { - val += math::TolerableValue()(real_log(X[idx])) * label[idx]; + val += math::TolerableValue()(phi::funcs::real_log(X[idx])) * label[idx]; } val = paddle::platform::reduceSum(val, tid, blockDim.x); @@ -152,7 +153,7 @@ void CrossEntropyFunctor::operator()( template class CrossEntropyFunctor; template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 29f562ec5eca2..e58cff60aea48 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -23,8 +23,8 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sequence_ops/sequence_softmax_op.h" +#include "paddle/phi/kernels/funcs/math.h" namespace paddle { namespace operators { @@ -67,7 +67,7 @@ __global__ void sequence_softmax_kernel(const T *in_data, T sum_data = 0; for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { T ele = in_data[start + tid]; - sum_data += real_exp(ele - shared_max_data); + sum_data += phi::funcs::real_exp(ele - shared_max_data); } sum_data = BlockReduce(temp_storage).Reduce(sum_data, cub::Sum()); @@ -79,7 +79,7 @@ __global__ void sequence_softmax_kernel(const T *in_data, // get final resit for (int tid = threadIdx.x; tid < span; tid += blockDim.x) { T ele = in_data[start + tid]; - ele = real_exp(ele - shared_max_data) / shared_sum_data; + ele = phi::funcs::real_exp(ele - shared_max_data) / shared_sum_data; out_data[start + tid] = ele; } } diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc index 9d62fabcbe736..7b98016201666 100644 --- a/paddle/phi/kernels/cpu/bce_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc @@ -16,9 +16,9 @@ #include // for max -#include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math.h" namespace phi { @@ -47,10 +47,9 @@ void BCELossKernel(const Context& dev_ctx, "Illegal input, input must be less than or equal to 1")); out_data[i] = (label_data[i] - static_cast(1)) * - std::max(paddle::operators::real_log(static_cast(1) - x_data[i]), + std::max(phi::funcs::real_log(static_cast(1) - x_data[i]), (T)(-100)) - - label_data[i] * - std::max(paddle::operators::real_log(x_data[i]), (T)(-100)); + label_data[i] * std::max(phi::funcs::real_log(x_data[i]), (T)(-100)); } } diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index 9048e87d04989..c84b3d4efbb88 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -17,9 +17,9 @@ #include #include -#include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math.h" namespace phi { template diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h index d518a877b26f2..2e6fe8b2d738b 100644 --- a/paddle/phi/kernels/funcs/functors.h +++ b/paddle/phi/kernels/funcs/functors.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/math.h" +#include "paddle/phi/kernels/funcs/math.h" namespace phi { namespace funcs { @@ -89,8 +89,7 @@ struct TanhFunctor { // y = 2 / (1 + e^-2x) - 1 T t0 = static_cast(2) * x; T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0); - return static_cast(2) / - (static_cast(1) + paddle::operators::real_exp(-t1)) - + return static_cast(2) / (static_cast(1) + phi::funcs::real_exp(-t1)) - static_cast(1); } }; @@ -111,8 +110,7 @@ struct SigmoidFunctor { inline HOSTDEVICE T operator()(T x) { // y = 1 / (1 + e^-x) T tmp = (x < kMin) ? kMin : ((x > kMax) ? 
kMax : x); - return static_cast(1) / - (static_cast(1) + paddle::operators::real_exp(-tmp)); + return static_cast(1) / (static_cast(1) + phi::funcs::real_exp(-tmp)); } }; diff --git a/paddle/fluid/operators/math.h b/paddle/phi/kernels/funcs/math.h similarity index 69% rename from paddle/fluid/operators/math.h rename to paddle/phi/kernels/funcs/math.h index 47281fb0280f0..f8c373badf187 100644 --- a/paddle/fluid/operators/math.h +++ b/paddle/phi/kernels/funcs/math.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,22 +15,22 @@ #pragma once #include "math.h" // NOLINT -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/hostdevice.h" -namespace paddle { -namespace operators { +namespace phi { +namespace funcs { -inline HOSTDEVICE platform::float16 real_exp(platform::float16 x) { - return static_cast(::expf(static_cast(x))); +inline HOSTDEVICE phi::dtype::float16 real_exp(phi::dtype::float16 x) { + return static_cast(::expf(static_cast(x))); } inline HOSTDEVICE float real_exp(float x) { return ::expf(x); } inline HOSTDEVICE double real_exp(double x) { return ::exp(x); } -inline HOSTDEVICE platform::float16 real_log(platform::float16 x) { - return static_cast(::logf(static_cast(x))); +inline HOSTDEVICE phi::dtype::float16 real_log(phi::dtype::float16 x) { + return static_cast(::logf(static_cast(x))); } inline HOSTDEVICE float real_log(float x) { return ::logf(x); } @@ -41,5 +41,5 @@ inline HOSTDEVICE float real_min(float x, float y) { return ::fminf(x, y); } inline HOSTDEVICE double real_min(double x, double y) { return ::fmin(x, y); } -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/nll_loss.h b/paddle/phi/kernels/gpu/nll_loss.h index 37a67b4767a9b..9d063d0ef44a0 100644 --- a/paddle/phi/kernels/gpu/nll_loss.h +++ b/paddle/phi/kernels/gpu/nll_loss.h @@ -19,10 +19,10 @@ #include #include -#include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math.h" namespace phi { static constexpr int kNumCUDAThreads = 512; diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h index 84a24449b3a1c..1cc025bac480f 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h @@ -17,13 +17,13 @@ #include #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/math.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index f61cd2c39674e..736c5608a6ac7 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -37,8 +37,8 @@ struct SigmoidBwdFunctor { dx_data = static_cast(0.); counts = 0; } else { - T simoid_x = static_cast(1) / - (static_cast(1) + paddle::operators::real_exp(-x)); + T simoid_x = + static_cast(1) / (static_cast(1) + phi::funcs::real_exp(-x)); T diff = simoid_x - label; dx_data = dout * diff; counts = 1; diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index b0e9efe5bbafe..fb0183ce1efd6 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -37,9 +37,8 @@ struct SigmoidFwdFunctor { } else { T term1 = (x > 0) ? x : 0; T term2 = x * label; - T term3 = paddle::operators::real_log( - static_cast(1) + - paddle::operators::real_exp(static_cast(-abs(x)))); + T term3 = phi::funcs::real_log( + static_cast(1) + phi::funcs::real_exp(static_cast(-abs(x)))); out_data = term1 - term2 + term3; counts = 1; diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h index c5d756e6eb4fa..14789a7d61ac8 100644 --- a/paddle/phi/kernels/impl/selu_kernel_impl.h +++ b/paddle/phi/kernels/impl/selu_kernel_impl.h @@ -15,9 +15,9 @@ #pragma once #include -#include "paddle/fluid/operators/math.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math.h" namespace phi { @@ -32,7 +32,7 @@ struct SeluFunctor { HOSTDEVICE void operator()(size_t idx) const { T x_ele = x_data_ptr_[idx]; if (x_ele <= 0) { - x_ele = alpha_ * paddle::operators::real_exp(x_ele) - alpha_; + x_ele = alpha_ * phi::funcs::real_exp(x_ele) - alpha_; } y_data_ptr_[idx] = scale_ * x_ele; } From b7e120d264a33d97bb7d946d6197edc488a0976c Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 17 Nov 2022 11:38:50 +0800 Subject: [PATCH 052/210] rm "paddle/phi/kernels/gpu/batch_norm_utils.h" in phi (#48057) --- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 2 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/batch_norm_utils.h | 142 ------------------ 4 files changed, 3 insertions(+), 145 deletions(-) delete mode 100644 paddle/phi/kernels/gpu/batch_norm_utils.h diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index f2054d4d396c6..efd55dee88cd0 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -16,9 +16,9 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpu/batch_norm_utils.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 8d072368633ef..e6c681588e4ed 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -22,10 +22,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" +#include 
"paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/batch_norm_utils.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 7b553db274d1f..44fe99046e158 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -29,10 +29,10 @@ namespace cub = hipcub; #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/batch_norm_utils.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h deleted file mode 100644 index c9c62026edfa7..0000000000000 --- a/paddle/phi/kernels/gpu/batch_norm_utils.h +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -using Tensor = DenseTensor; - -template -inline void ResizeToChannelFirst(const DeviceContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - - } else if (dim == 2) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - } else if (dim == 1) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - } -} - -template -inline void ResizeToChannelLast(const DeviceContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[4]; - in_dims_vec[4] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - - } else if (dim == 2) { - // input - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[3]; - in_dims_vec[3] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - } else if (dim == 1) { - transformed_input->Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[2]; - in_dims_vec[2] = input->dims()[1]; - transformed_input->Resize(phi::make_ddim(in_dims_vec)); - context.template Alloc(transformed_input); - } -} - -template -inline void TransToChannelFirst(const DeviceContext& context, - const Tensor* input, - Tensor* transformed_input) { - VLOG(5) << "Why am I called?"; - int dim = input->dims().size() - 2; - if (dim == 3) { - std::vector axis{0, 4, 1, 2, 3}; - funcs::Transpose trans5; - trans5(context, *input, transformed_input, axis); - - } else if (dim == 2) { - std::vector axis{0, 3, 1, 2}; - funcs::Transpose trans4; - trans4(context, *input, transformed_input, axis); - } else if (dim == 1) { - std::vector axis{0, 2, 1}; - funcs::Transpose trans3; - trans3(context, *input, transformed_input, axis); - } -} - -template -inline void TransToChannelLast(const DeviceContext& context, - const Tensor* input, - Tensor* transformed_input) { - int dim = input->dims().size() - 2; - if (dim == 3) { - std::vector axis{0, 2, 3, 4, 1}; - funcs::Transpose trans5; - trans5(context, *input, transformed_input, axis); - - } else if (dim == 2) { - std::vector axis{0, 2, 3, 1}; - funcs::Transpose trans4; - trans4(context, *input, 
transformed_input, axis); - } else if (dim == 1) { - std::vector axis{0, 2, 1}; - funcs::Transpose trans3; - trans3(context, *input, transformed_input, axis); - } -} - -} // namespace phi From 460d5040d2e8fd58ab470ba376438b56a0cb8dd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=E5=90=B4=E5=98=89=E6=96=87?= <417333277@qq.com> Date: Thu, 17 Nov 2022 11:44:13 +0800 Subject: [PATCH 053/210] Remove reduntant numpy input in Example code, test=document_fix (#47916) --- python/paddle/distributed/utils/moe_utils.py | 33 ++--- python/paddle/fft.py | 124 ++++++++++-------- .../paddle/sparse/nn/functional/activation.py | 39 +++--- python/paddle/sparse/nn/layer/activation.py | 43 +++--- 4 files changed, 124 insertions(+), 115 deletions(-) diff --git a/python/paddle/distributed/utils/moe_utils.py b/python/paddle/distributed/utils/moe_utils.py index cd7c0e758d4e0..eb7e73c363bf2 100644 --- a/python/paddle/distributed/utils/moe_utils.py +++ b/python/paddle/distributed/utils/moe_utils.py @@ -71,7 +71,6 @@ def global_scatter( .. code-block:: python # required: distributed - import numpy as np import paddle from paddle.distributed import init_parallel_env init_parallel_env() @@ -79,17 +78,14 @@ def global_scatter( world_size = 2 d_model = 2 in_feat = d_model - local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \ - dtype=np.float32) + local_input_buf = paddle.to_tensor([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \ + dtype='float32', stop_gradient=False) if paddle.distributed.ParallelEnv().local_rank == 0: - local_count = np.array([2, 1, 1, 1]) - global_count = np.array([2, 1, 1, 1]) + local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64") + global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64") else: - local_count = np.array([1, 1, 2, 1]) - global_count = np.array([1, 1, 2, 1]) - local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) - local_count = paddle.to_tensor(local_count, dtype="int64") - global_count = paddle.to_tensor(global_count, dtype="int64") + local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64") + global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64") a = paddle.distributed.utils.global_scatter(local_input_buf, \ local_count, global_count) a.stop_gradient = False @@ -193,7 +189,6 @@ def global_gather( .. 
code-block:: python # required: distributed - import numpy as np import paddle from paddle.distributed import init_parallel_env init_parallel_env() @@ -201,17 +196,15 @@ def global_gather( world_size = 2 d_model = 2 in_feat = d_model - local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\ - dtype=np.float32) + local_input_buf = paddle._to_tensor([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\ + dtype='float32', stop_gradient=False) if paddle.distributed.ParallelEnv().local_rank == 0: - local_count = np.array([2, 1, 1, 1]) - global_count = np.array([2, 1, 1, 1]) + local_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64") + global_count = paddle.to_tensor([2, 1, 1, 1], dtype="int64") else: - local_count = np.array([1, 1, 2, 1]) - global_count = np.array([1, 1, 2, 1]) - local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) - local_count = paddle.to_tensor(local_count, dtype="int64") - global_count = paddle.to_tensor(global_count, dtype="int64") + local_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64") + global_count = paddle.to_tensor([1, 1, 2, 1], dtype="int64") + a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count) print(a) # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]] diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 8bc95cd37e9f2..1e4ca9237469b 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -521,26 +521,29 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.mgrid[:4, :4, :4][1] - xp = paddle.to_tensor(x) - fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() - print(fftn_xp) - # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + arr = paddle.arange(4, dtype="float64") + x = paddle.meshgrid(arr, arr, arr)[1] + + fftn_xp = paddle.fft.fftn(x, axes=(1, 2)) + print(fftn_xp.numpy()) + # [[[24.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]] - # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] + + # [[24.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+8.j 0.+0.j 0.+0.j 0.-0.j] # [-8.+0.j 0.+0.j 0.+0.j 0.-0.j] # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] @@ -901,15 +904,16 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - fft2_xp = paddle.fft.fft2(xp).numpy() + arr = paddle.arange(2, dtype="float64") + x = paddle.meshgrid(arr, arr)[0] + + fft2_xp = paddle.fft.fft2(x) print(fft2_xp) - # [[ 2.+0.j -2.+0.j] - # [ 0.+0.j 0.+0.j]] + # Tensor(shape=[2, 2], dtype=complex128, place=Place(gpu:0), stop_gradient=True, + # [[ (2+0j), 0j ], + # [(-2+0j), 0j ]]) """ _check_at_least_ndim(x, 2) @@ -971,15 +975,16 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.mgrid[:2, :2][1] - xp = paddle.to_tensor(x) - ifft2_xp = paddle.fft.ifft2(xp).numpy() + arr = paddle.arange(2, dtype="float64") + x = paddle.meshgrid(arr, arr)[0] + + ifft2_xp = paddle.fft.ifft2(x) print(ifft2_xp) - # [[ 0.5+0.j -0.5+0.j] - # [ 0. +0.j 0. 
+0.j]] + # Tensor(shape=[2, 2], dtype=complex128, place=Place(gpu:0), stop_gradient=True, + # [[ (0.5+0j), 0j ], + # [(-0.5+0j), 0j ]]) """ _check_at_least_ndim(x, 2) if s is not None: @@ -1033,16 +1038,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): .. code-block:: python import paddle - import numpy as np - - x = paddle.to_tensor(np.mgrid[:5, :5][0].astype(np.float32)) - print(paddle.fft.rfft2(x)) - # Tensor(shape=[5, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[ (50+0j) , (1.1920928955078125e-07+0j) , 0j ], - # [(-12.5+17.204774856567383j) , (-9.644234211236835e-08+7.006946134424652e-08j) , 0j ], - # [(-12.500000953674316+4.061495304107666j) , (3.6837697336977726e-08-1.1337477445749755e-07j), 0j ], - # [(-12.500000953674316-4.061495304107666j) , (3.6837697336977726e-08+1.1337477445749755e-07j), 0j ], - # [(-12.5-17.204774856567383j) , (-9.644234211236835e-08-7.006946134424652e-08j) , 0j ]]) + + arr = paddle.arange(5, dtype="float64") + x = paddle.meshgrid(arr, arr)[0] + + result = paddle.fft.rfft2(x) + print(result.numpy()) + # [[ 50. +0.j 0. +0.j 0. +0.j ] + # [-12.5+17.20477401j 0. +0.j 0. +0.j ] + # [-12.5 +4.0614962j 0. +0.j 0. +0.j ] + # [-12.5 -4.0614962j 0. +0.j 0. +0.j ] + # [-12.5-17.20477401j 0. +0.j 0. +0.j ]] """ _check_at_least_ndim(x, 2) if s is not None: @@ -1192,13 +1198,20 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): .. code-block:: python - import numpy as np import paddle - x = np.mgrid[:5, :5][0].astype(np.float64) - xp = paddle.to_tensor(x) - ihfft2_xp = paddle.fft.ihfft2(xp).numpy() - print(ihfft2_xp) + arr = paddle.arange(5, dtype="float64") + x = paddle.meshgrid(arr, arr)[0] + print(x) + # Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [[0., 0., 0., 0., 0.], + # [1., 1., 1., 1., 1.], + # [2., 2., 2., 2., 2.], + # [3., 3., 3., 3., 3.], + # [4., 4., 4., 4., 4.]]) + + ihfft2_xp = paddle.fft.ihfft2(x) + print(ihfft2_xp.numpy()) # [[ 2. +0.j 0. +0.j 0. +0.j ] # [-0.5-0.68819096j 0. +0.j 0. +0.j ] # [-0.5-0.16245985j 0. +0.j 0. +0.j ] @@ -1250,15 +1263,11 @@ def fftfreq(n, d=1.0, dtype=None, name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([3, 1, 2, 2, 3], dtype=float) scalar_temp = 0.5 - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(5, d=scalar_temp) print(fftfreq_xp) - # Tensor(shape=[5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, # [ 0. , 0.40000001, 0.80000001, -0.80000001, -0.40000001]) """ @@ -1301,13 +1310,10 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([3, 1, 2, 2, 3], dtype=float) scalar_temp = 0.3 - n = x.size - rfftfreq_xp = paddle.fft.rfftfreq(n, d=scalar_temp) + rfftfreq_xp = paddle.fft.rfftfreq(5, d=scalar_temp) print(rfftfreq_xp) # Tensor(shape=[3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -1343,15 +1349,17 @@ def fftshift(x, axes=None, name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.fftshift(fftfreq_xp).numpy() + fftfreq_xp = paddle.fft.fftfreq(5, d=0.3) + print(fftfreq_xp) + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [ 0. , 0.66666669, 1.33333337, -1.33333337, -0.66666669]) + + res = paddle.fft.fftshift(fftfreq_xp) print(res) - # [-1.3333334 -0.6666667 0. 
0.6666667 1.3333334] + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-1.33333337, -0.66666669, 0. , 0.66666669, 1.33333337]) """ shape = paddle.shape(x) @@ -1386,15 +1394,17 @@ def ifftshift(x, axes=None, name=None): .. code-block:: python - import numpy as np import paddle - x = np.array([3, 1, 2, 2, 3], dtype=float) - n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) - res = paddle.fft.ifftshift(fftfreq_xp).numpy() + fftfreq_xp = paddle.fft.fftfreq(5, d=0.3) + print(fftfreq_xp) + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [ 0. , 0.66666669, 1.33333337, -1.33333337, -0.66666669]) + + res = paddle.fft.ifftshift(fftfreq_xp) print(res) - # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [ 1.33333337, -1.33333337, -0.66666669, 0. , 0.66666669]) """ shape = paddle.shape(x) diff --git a/python/paddle/sparse/nn/functional/activation.py b/python/paddle/sparse/nn/functional/activation.py index cbe2ddd0d79db..93c5e74014f3e 100644 --- a/python/paddle/sparse/nn/functional/activation.py +++ b/python/paddle/sparse/nn/functional/activation.py @@ -87,28 +87,31 @@ def softmax(x, axis=-1, name=None): .. code-block:: python import paddle - import numpy as np paddle.seed(100) - mask = np.random.rand(3, 4) < 0.5 - np_x = np.random.rand(3, 4) * mask - # [[0. 0. 0.96823406 0.19722934] - # [0.94373937 0. 0.02060066 0.71456372] - # [0. 0. 0. 0.98275049]] - - csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], - # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, - # 0.98275049]) + mask = paddle.rand((3, 4)) < 0.5 + x = paddle.rand((3, 4)) * mask + print(x) + # Tensor(shape=[3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0.83438963, 0.70008713, 0. , 0.88831252], + # [0.02200012, 0. , 0.75432241, 0.65136462], + # [0.96088767, 0.82938021, 0.35367414, 0.86653489]]) + + csr = x.to_sparse_csr() + print(csr) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0 , 3 , 6 , 10], + # cols=[0, 1, 3, 0, 2, 3, 0, 1, 2, 3], + # values=[0.83438963, 0.70008713, 0.88831252, 0.02200012, 0.75432241, + # 0.65136462, 0.96088767, 0.82938021, 0.35367414, 0.86653489]) out = paddle.sparse.nn.functional.softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], - # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, - # 1. ]) + print(out) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0 , 3 , 6 , 10], + # cols=[0, 1, 3, 0, 2, 3, 0, 1, 2, 3], + # values=[0.34132850, 0.29843223, 0.36023921, 0.20176248, 0.41964680, + # 0.37859070, 0.30015594, 0.26316854, 0.16354506, 0.27313042]) """ return _C_ops.sparse_softmax(x, axis) diff --git a/python/paddle/sparse/nn/layer/activation.py b/python/paddle/sparse/nn/layer/activation.py index 3ad856f69fbec..91d5c198189dd 100644 --- a/python/paddle/sparse/nn/layer/activation.py +++ b/python/paddle/sparse/nn/layer/activation.py @@ -86,29 +86,32 @@ class Softmax(Layer): .. code-block:: python import paddle - import numpy as np - paddle.seed(100) - - mask = np.random.rand(3, 4) < 0.5 - np_x = np.random.rand(3, 4) * mask - # [[0. 0. 0.96823406 0.19722934] - # [0.94373937 0. 
0.02060066 0.71456372] - # [0. 0. 0. 0.98275049]] - - csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], - # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, - # 0.98275049]) + paddle.seed(2022) + + mask = paddle.rand((3, 4)) < 0.7 + x = paddle.rand((3, 4)) * mask + print(x) + # Tensor(shape=[3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0.08325022, 0.27030438, 0. , 0.83883715], + # [0. , 0.95856029, 0.24004589, 0. ], + # [0.14500992, 0.17088132, 0. , 0. ]]) + + csr = x.to_sparse_csr() + print(csr) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 3, 5, 7], + # cols=[0, 1, 3, 1, 2, 0, 1], + # values=[0.08325022, 0.27030438, 0.83883715, 0.95856029, 0.24004589, + # 0.14500992, 0.17088132]) softmax = paddle.sparse.nn.Softmax() out = softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], - # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, - # 1. ]) + print(out) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 3, 5, 7], + # cols=[0, 1, 3, 1, 2, 0, 1], + # values=[0.23070428, 0.27815846, 0.49113727, 0.67227983, 0.32772022, + # 0.49353254, 0.50646752]) """ def __init__(self, axis=-1, name=None): From c20eb7a6aca801f773f5a54c32ee65e2134939eb Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Thu, 17 Nov 2022 11:50:48 +0800 Subject: [PATCH 054/210] support stage2 for gradient merge. (#47711) --- .../sharding/group_sharded_stage2.py | 50 +++++++++++-------- .../sharding/group_sharded_utils.py | 13 +---- .../fleet/dygraph_group_sharded_api_eager.py | 5 -- 3 files changed, 29 insertions(+), 39 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 05a25223e65ba..f756162727edd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -418,17 +418,6 @@ def cleanup(): ) ) - if self._dp_group and self._dp_group.nranks > 1: - assert ( - not self._reduce_overlap - ), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.' - # TODO(wuhuachao):after the new communication lib upgrading, overlapping the comm of dp + stage2. - dist.all_reduce( - tensor=param.grad, - group=self._dp_group, - sync_op=True, - ) - # Clear the task flow and trigger callback to clear the redundant gradient # self._clear_task_flow() @@ -485,17 +474,6 @@ def cleanup(): ) ) - if self._dp_group and self._dp_group.nranks > 1: - assert ( - not self._reduce_overlap - ), 'dp + stage2 hybrid parallel only Synchronize due to the new communication lib.' - # TODO(wuhuachao):after the new communication lib upgrading, overlapping the comm of dp + stage2. - dist.all_reduce( - tensor=grad_storage.buffer, - group=self._dp_group, - sync_op=True, - ) - cleanup() # Clear the task flow and trigger callback to clear the redundant gradient @@ -648,8 +626,34 @@ def _rank_buffer_size(self, buffer_max_size, model_size): ) return rank_buffer_size + def _dp_allreduce(self): + # do dp allreduce here for gradient merge. 
+ if self._dp_group and self._dp_group.nranks > 1: + for dtype in self._grad_storages.keys(): + for rank, g in sorted( + self._grad_storages[dtype].items(), key=lambda x: x[0] + ): + if g.destination == self._rank: + assert g.buffer._is_initialized() + dist.all_reduce( + tensor=g.buffer, + group=self._dp_group, + sync_op=True, + ) + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + dst_rank = self._trainable_param2rank[param.name] + if dst_rank == self._rank: + dist.all_reduce( + tensor=param.grad, + group=self._dp_group, + sync_op=True, + ) + def _redefine_opt_step(self): grad_func = self._grad_scale + dp_allreduce_func = self._dp_allreduce + for opt in self._sharding_optimizers: opt_step = opt.step @@ -658,7 +662,9 @@ def _opt_step(self): # Wait for the last reduce task. This wait must before grad scale function. assert self._comm_task is not None self._comm_task.wait() + grad_func() + dp_allreduce_func() opt_step() opt.step = MethodType(_opt_step, opt) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 2976ef88e5983..5ab3ffb4e4eec 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -23,7 +23,6 @@ from paddle.fluid import layers from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import dygraph_only -from paddle.distributed import fleet, ParallelMode class Taskflow: @@ -245,18 +244,8 @@ def unscale_method(self, optimizer): self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") - hcg = fleet.fleet._hcg if hasattr(fleet.fleet, "_hcg") else None - hybrid_parallel = ( - hcg is not None - and hcg.get_parallel_mode() is not ParallelMode.DATA_PARALLEL - ) - paddle.distributed.all_reduce( - is_found_inf, - op=paddle.distributed.ReduceOp.MAX, - group=hcg.get_check_parallel_group() - if hybrid_parallel - else optimizer._group, + is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None ) self._found_inf = is_found_inf.numpy()[0] diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py index 04c13f358bcc6..2a93a8d38f42f 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_api_eager.py @@ -148,11 +148,6 @@ def test_sharding_api(): output_dir = tempfile.mkdtemp() - # test sharding + dp, just for test - dp_group = paddle.distributed.new_group( - list(range(paddle.distributed.get_world_size())) - ) - # fp16 stage2_params = train_mlp( mlp1, From cb087bebbe5cf20af7e37129773b88b8af033c66 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Thu, 17 Nov 2022 13:01:07 +0800 Subject: [PATCH 055/210] fix bug of p2p (#48045) --- .../fleet/meta_parallel/pp_utils/p2p_communication.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index e97cf3c02302f..7f2e2b8d7a48b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ 
b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -164,7 +164,11 @@ def set_send_message(self, tensor): [d.shape for d in tensor if not d.stop_gradient] ) self.send_dtype_message = tuple( - [paddle_2_number(d.dtype) for d in tensor] + [ + paddle_2_number(d.dtype) + for d in tensor + if not d.stop_gradient + ] ) From bf6af816f9dc00fb93d598420f98c9199974f985 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 17 Nov 2022 13:05:22 +0800 Subject: [PATCH 056/210] Implement a common dimension simplifier. (#47981) * Implement a common dims simplifier. * Fix the include position error. * Reduce the cpu overhead of broadcast computing. --- paddle/phi/kernels/funcs/broadcast_function.h | 235 ++--------------- paddle/phi/kernels/funcs/dims_simplifier.h | 247 ++++++++++++++++++ .../kernels/primitive/datamover_primitives.h | 45 ++-- 3 files changed, 293 insertions(+), 234 deletions(-) create mode 100644 paddle/phi/kernels/funcs/dims_simplifier.h diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 22ed5b29d77bc..67d3a309b1f33 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/elementwise_base.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) +#include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,203 +28,6 @@ namespace funcs { #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -struct DimensionsTransform { - using DimVector = std::vector; - typedef void (*MergeFunctor)( - bool &, std::vector &, DimVector &, int, int); - int64_t N; - int64_t dim_size; - DimVector out_dims; - std::vector in_dims; - - private: - // To compensate the lackage of input_tensors` dimension with input - // variable 'axis'. - void InputDimensionsExtend(int N, int axis) { - for (auto &in_dim : in_dims) { - int64_t in_idx = 0; - if (in_dim.size() < dim_size) { - DimVector tmp_dim(dim_size, 1); - for (; in_idx < in_dim.size();) { - if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { - tmp_dim[axis] = in_dim[in_idx]; - in_idx++; - axis++; - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "received %d.", - in_idx + 1, - axis + 1, - out_dims[axis], - in_dim[in_idx])); - } - } - in_dim.resize(dim_size); - std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); - } else { - for (; in_idx < dim_size;) { - if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { - in_idx++; - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The %d-th dimension of input tensor is expected to be equal " - "with the %d-th dimension of output tensor %d or 1, but " - "received %d.", - in_idx + 1, - in_idx + 1, - out_dims[in_idx], - in_dim[in_idx])); - } - } - } - std::reverse(in_dim.begin(), in_dim.end()); - } - std::reverse(out_dims.begin(), out_dims.end()); - } - - // Merge sequential dimension to shrink calculation cost for - // offset computation in CUDA Kernel. 
- template - __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { - auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { - (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, - vec->begin() + m_idx, - 1, - std::multiplies()); - vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); - }; - - int64_t i = 0; - while (i < dim_size) { - int cnt = 0; - int low_idx = i; - bool equal = true; - do { - merge_func(equal, in_dims, out_dims, i, N); - if (equal) { - i++; - cnt++; - } else { - break; - } - } while (i < dim_size); - - if (cnt > 1) { - for (auto &in_dim : in_dims) { - VectorReorganise(&in_dim, low_idx, i); - } - VectorReorganise(&out_dims, low_idx, i); - dim_size -= --cnt; - i -= cnt; - } else if (cnt < 1) { - i++; - } - } - } - - // To judge whether shape of any input tensors is sequential - // 1-value-dimensions, and metric the length of it. - bool FindSequentialOneDim(int *swap_index) { - int index = 0; - int max_one_length = 0; - for (int j = 0; j < N; ++j) { - int seq_one_length = 0; - bool active_seq = false; - - for (int i = 0; i < dim_size; ++i) { - if (!active_seq && in_dims[j][i] == 1) { - seq_one_length = 1; - active_seq = true; - } else if (active_seq) { - if (in_dims[j][i] == 1) { - seq_one_length++; - } else { - active_seq = false; - } - } - } - index = seq_one_length > max_one_length ? j : index; - max_one_length = std::max(seq_one_length, max_one_length); - } - - bool has_seq_one = max_one_length > 1; - if (has_seq_one) { - std::swap(in_dims[0], in_dims[index]); - *swap_index = index; - } - return has_seq_one; - } - - public: - explicit DimensionsTransform(const std::vector &ins, - const phi::DDim &dims, - int axis) { - N = std::max(static_cast(ins.size()), 2); - dim_size = dims.size(); - out_dims = phi::vectorize(dims); - in_dims.resize(N); - if (ins.size() == 1) { - // when ins.size() = 1, broadcast input to output - in_dims[0] = phi::vectorize(ins[0]->dims()); - in_dims[1] = out_dims; - // Add out_dims to in_dims to avoid errors in dims merging - } else { - for (int j = 0; j < N; ++j) { - in_dims[j] = phi::vectorize(ins[j]->dims()); - } - } - InputDimensionsExtend(N, axis); - - // To Merge the dimensions of input_tensors while the consequtive - // equal-dimensions appears. Example below : - // in_1.shape = [2, 3, 4, 5] in_1.shape = [2, 12, 5] - // in_2.shape = [1, 3, 4, 5] -> in_2.shape = [1, 12, 5] - // in_3.shape = [2, 3, 4, 1] in_3.shape = [2, 12, 1] - auto merge_sequential_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - for (int j = 1; j < num; ++j) { - equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; - } - }; - MergeFunctor merge_ptr = merge_sequential_dims; - MergeDimensions(merge_ptr, N); - - // To Merge the dimension of input_tensors while the sequential - // 1-value-dimensions appears. Example below : - // in_1.shape = [2, 1, 1, 5] in_1.shape = [2, 1, 5] - // in_2.shape = [2, 3, 4, 5] -> in_2.shape = [1, 12, 5] - // in_3.shape = [2, 3, 4, 1] in_3.shape = [2, 12, 1] - // Caution: Once 1-value-dimensions appears, the corresponding - // shape position of other input tensors must be same with the - // output tensor`s shape, or incorrect merge may occur. 
- auto merge_sequential_one_dims = [](bool &equal, - std::vector &in_dims, - DimVector &out, - int i, - int num) { - equal = in_dims[0][i] == 1; - if (equal) { - for (int j = 1; j < num; ++j) { - equal &= in_dims[j][i] == out[i]; - } - } - }; - for (auto i = 0; i < dim_size; ++i) { - int swap_idx = 0; - bool has_seq_one = FindSequentialOneDim(&swap_idx); - if (!has_seq_one) break; - merge_ptr = merge_sequential_one_dims; - MergeDimensions(merge_ptr, N); - std::swap(in_dims[swap_idx], in_dims[0]); - } - } -}; - template int GetVecsize(const std::vector &ins, std::vector *outs) { @@ -313,7 +117,7 @@ struct BroadcastDataLoader { #pragma unroll for (int i = 0; i < phi::DDim::kMaxRank; ++i) { - if (i == configs[0].kDims) break; + if (i == configs[0].rank) break; auto fast_divmoder = configs[0].divmoders[i].Divmod(idx); idx = fast_divmoder.val[0]; #pragma unroll @@ -1071,7 +875,19 @@ void BroadcastKernelForDifferentVecSize( #endif // mergedim and get vec_size - const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(4)) { + for (size_t i = 0; i < dims_simplifier.in_dims.size(); ++i) { + VLOG(4) << "input i=" << i << ": origin_dims={" << ins[i]->dims() + << "}, simplied_dims={" + << phi::make_ddim(dims_simplifier.in_dims[i]) << "}"; + } + VLOG(4) << "output: origin_dims={" << (*outs)[0]->dims() + << "}, simplied_dims={" << phi::make_ddim(dims_simplifier.out_dims) + << "}"; + } + phi::Array configs; // get vec_size @@ -1081,14 +897,14 @@ void BroadcastKernelForDifferentVecSize( 2, phi::errors::InvalidArgument( "XPU only support inputs is 2, but received %d", ins.size())); - configs[0] = kps::details::BroadcastConfig(merge_dims.out_dims, - merge_dims.in_dims[0], - merge_dims.in_dims[1], - merge_dims.dim_size); - configs[1] = kps::details::BroadcastConfig(merge_dims.out_dims, - merge_dims.in_dims[1], - merge_dims.in_dims[0], - merge_dims.dim_size); + configs[0] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[0], + dims_simplifier.in_dims[1], + dims_simplifier.rank); + configs[1] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[1], + dims_simplifier.in_dims[0], + dims_simplifier.rank); auto type = kps::details::OptType::CanNotOptimize; bool is_optimize = configs[0].cmp_type != type; int vec_size = is_optimize ? VecSizeL : VecSizeM; @@ -1099,8 +915,9 @@ void BroadcastKernelForDifferentVecSize( // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} // if (ins[i]->numel() != (*outs)[0]->numel()) { if (ins[i]->numel()) { - configs[i] = kps::details::BroadcastConfig( - merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); + configs[i] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[i], + dims_simplifier.rank); } } int vec_size = GetVecsize(ins, outs); diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h new file mode 100644 index 0000000000000..21f14bdba7834 --- /dev/null +++ b/paddle/phi/kernels/funcs/dims_simplifier.h @@ -0,0 +1,247 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +namespace funcs { + +struct BroadcastDimsSimplifier { + using DimVector = std::vector; + typedef void (*MergeFunctor)( + bool &, std::vector &, DimVector &, int, int); + + int64_t N; + int64_t rank; + DimVector out_dims; + std::vector in_dims; + + public: + BroadcastDimsSimplifier(const std::vector &ins, + const phi::DDim &dims, + int axis) { + if (!NeedBroadcast(ins, dims)) { + int64_t numel = phi::product(dims); + rank = 1; + N = ins.size(); + out_dims = DimVector{numel}; + in_dims.resize(N); + for (int64_t i = 0; i < N; ++i) { + in_dims[i] = DimVector{numel}; + } + return; + } + + N = std::max(static_cast(ins.size()), 2); + in_dims.resize(N); + rank = dims.size(); + out_dims = phi::vectorize(dims); + if (ins.size() == 1) { + // When ins.size() = 1, broadcast input to output. + in_dims[0] = phi::vectorize(ins[0]->dims()); + // Add out_dims to in_dims to avoid errors in dims merging. + in_dims[1] = out_dims; + } else { + for (int j = 0; j < N; ++j) { + in_dims[j] = phi::vectorize(ins[j]->dims()); + } + } + ExtendInputDimensions(N, axis); + + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. Example below : + // in_1.shape = [2, 3, 4, 5] in_1.shape = [2, 12, 5] + // in_2.shape = [1, 3, 4, 5] -> in_2.shape = [1, 12, 5] + // in_3.shape = [2, 3, 4, 1] in_3.shape = [2, 12, 1] + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + for (int j = 1; j < num; ++j) { + equal &= (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + // To Merge the dimension of input_tensors while the sequential + // 1-value-dimensions appears. Example below : + // in_1.shape = [2, 1, 1, 5] in_1.shape = [2, 1, 5] + // in_2.shape = [2, 3, 4, 5] -> in_2.shape = [1, 12, 5] + // in_3.shape = [2, 3, 4, 1] in_3.shape = [2, 12, 1] + // Caution: Once 1-value-dimensions appears, the corresponding + // shape position of other input tensors must be same with the + // output tensor`s shape, or incorrect merge may occur. + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, + int i, + int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal &= in_dims[j][i] == out[i]; + } + } + }; + for (auto i = 0; i < rank; ++i) { + int swap_idx = 0; + bool has_seq_one = FindSequentialOneDim(&swap_idx); + if (!has_seq_one) { + break; + } + merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[swap_idx], in_dims[0]); + } + } + + private: + bool NeedBroadcast(const std::vector &ins, + const phi::DDim &dims) { + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag &= ins[0]->dims() == in->dims(); + } + if (ins.size() > 0) { + no_broadcast_flag &= dims == ins[0]->dims(); + } + return !no_broadcast_flag; + } + + // To compensate the lackage of input_tensors' dimension with axis. 
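+  // e.g. with out_dims = [2, 3, 4, 5], an input of shape [3, 4] and
+  // axis = 1 is padded to [1, 3, 4, 1]; both in_dims and out_dims are
+  // then reversed so that the innermost dimension comes first.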
+ void ExtendInputDimensions(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < rank) { + DimVector tmp_dim(rank, 1); + for (; in_idx < in_dim.size();) { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "received %d.", + in_idx + 1, + axis + 1, + out_dims[axis], + in_dim[in_idx])); + } + } + in_dim.resize(rank); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + for (; in_idx < rank;) { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The %d-th dimension of input tensor is expected to be equal " + "with the %d-th dimension of output tensor %d or 1, but " + "received %d.", + in_idx + 1, + in_idx + 1, + out_dims[in_idx], + in_dim[in_idx])); + } + } + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + // Merge sequential dimension to shrink calculation cost for + // offset computation in CUDA Kernel. + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = std::accumulate(vec->begin() + l_idx, + vec->begin() + m_idx, + 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < rank) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < rank); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + rank -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + // To judge whether shape of any input tensors is sequential + // 1-value-dimensions, and metric the length of it. + bool FindSequentialOneDim(int *swap_index) { + int index = 0; + int max_one_length = 0; + for (int j = 0; j < N; ++j) { + int seq_one_length = 0; + bool active_seq = false; + + for (int i = 0; i < rank; ++i) { + if (!active_seq && in_dims[j][i] == 1) { + seq_one_length = 1; + active_seq = true; + } else if (active_seq) { + if (in_dims[j][i] == 1) { + seq_one_length++; + } else { + active_seq = false; + } + } + } + index = seq_one_length > max_one_length ? 
j : index; + max_one_length = std::max(seq_one_length, max_one_length); + } + + bool has_seq_one = max_one_length > 1; + if (has_seq_one) { + std::swap(in_dims[0], in_dims[index]); + *swap_index = index; + } + return has_seq_one; + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index d6139501b4e3c..ac27916196105 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -85,33 +85,28 @@ struct FastDivMod { struct BroadcastConfig { FastDivMod divmoders[phi::DDim::kMaxRank]; uint32_t strides[phi::DDim::kMaxRank]; - int kDims{0}; - HOSTDEVICE BroadcastConfig() {} - - HOSTDEVICE BroadcastConfig(const std::vector& out_dims, - const std::vector& in_dims, - int dim_size) { - std::vector strides_in; - std::vector divmoders_in; - // for divmoders - divmoders_in.resize(dim_size); + int rank{0}; + + // BroadcastConfig should be defined on host used on device. + BroadcastConfig() {} + + BroadcastConfig(const std::vector& out_dims, + const std::vector& in_dims, + int dim_size) { for (int i = 0; i < dim_size; ++i) { - divmoders_in[i] = FastDivMod(out_dims[i]); + divmoders[i] = FastDivMod(out_dims[i]); } - // for strides - strides_in.resize(dim_size, 1); + for (int i = 0; i < dim_size; ++i) { - strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; - strides_in[i] = (i != 0 && strides_in[i] != 0) - ? std::accumulate(in_dims.begin(), - in_dims.begin() + i, - 1, - std::multiplies()) - : strides_in[i]; + strides[i] = in_dims[i] == 1 ? 0 : 1; + strides[i] = (i != 0 && strides[i] != 0) + ? std::accumulate(in_dims.begin(), + in_dims.begin() + i, + 1, + std::multiplies()) + : strides[i]; } - kDims = dim_size; - memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); - memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod)); + rank = dim_size; } }; @@ -452,7 +447,7 @@ __device__ __forceinline__ void ReadDataBc( } #pragma unroll for (int i = 0; i < phi::DDim::kMaxRank; ++i) { - if (i >= config.kDims) break; + if (i >= config.rank) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; @@ -784,7 +779,7 @@ __device__ __forceinline__ void ReadDataBc( } #pragma unroll for (int i = 0; i < phi::DDim::kMaxRank; ++i) { - if (i >= config.kDims) break; + if (i >= config.rank) break; auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; From ff44df1841563ff8da4683fab09324e5f4052e51 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Thu, 17 Nov 2022 13:35:41 +0800 Subject: [PATCH 057/210] [Paddle Inference] Support cast trt converter of bool input and output . 
(#48043) * add_cast_bool * cast --- .../inference/tensorrt/convert/cast_op.cc | 7 +++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 11 ++++++++--- .../ir/inference/test_trt_convert_cast.py | 18 ++++++++++++++---- .../ir/inference/test_trt_convert_where.py | 2 +- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/cast_op.cc b/paddle/fluid/inference/tensorrt/convert/cast_op.cc index b2b06744d984a..7b74e7aa17611 100644 --- a/paddle/fluid/inference/tensorrt/convert/cast_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/cast_op.cc @@ -42,14 +42,21 @@ class CastOpConverter : public OpConverter { auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Identity, *input); switch (out_dtype) { + case 0: // BOOL = 0 + layer->setOutputType(0, nvinfer1::DataType::kBOOL); + layer->getOutput(0)->setType(nvinfer1::DataType::kBOOL); + break; case 2: // INT32 = 2 layer->setOutputType(0, nvinfer1::DataType::kINT32); + layer->getOutput(0)->setType(nvinfer1::DataType::kINT32); break; case 4: // FP16 = 4 layer->setOutputType(0, nvinfer1::DataType::kHALF); + layer->getOutput(0)->setType(nvinfer1::DataType::kHALF); break; case 5: // FP32 = 5 layer->setOutputType(0, nvinfer1::DataType::kFLOAT); + layer->getOutput(0)->setType(nvinfer1::DataType::kFLOAT); break; default: LOG(ERROR) << "Unable to convert a fluid data type(" << out_dtype diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 3e6fd52fab7f4..dfe1b2ca623bc 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2124,10 +2124,15 @@ struct SimpleOpTypeSetTeller : public Teller { VLOG(3) << "unsupport data type conversion"; return false; } - if (in_dtype == 0) { - VLOG(3) << "do not support input data type as bool now"; - return false; +#if IS_TRT_VERSION_GE(8400) + if (in_dtype == 0 || out_dtype == 0) { + if (with_dynamic_shape) { + VLOG(3) << "the cast op supports inputs and outputs of BOOL by " + "trt8.4 above "; + return true; + } } +#endif if (!((in_dtype == 5 || in_dtype == 4 || in_dtype == 2) && (out_dtype == 5 || out_dtype == 4 || out_dtype == 2))) { VLOG(3) << "only valid conversions are: " diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py index c063019a8f475..3730fabd03e0e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py @@ -30,9 +30,16 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return False if attrs[0]['in_dtype'] in [4, 5] and attrs[0]['out_dtype'] == 4: return False - if attrs[0]['in_dtype'] not in [2, 4, 5] or attrs[0][ - 'out_dtype' - ] not in [2, 4, 5]: + + out_dtype = [2, 4, 5] + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 > 8400: + out_dtype.insert(3, 0) + + if ( + attrs[0]['in_dtype'] not in [2, 4, 5] + or attrs[0]['out_dtype'] not in out_dtype + ): return False return True @@ -49,6 +56,7 @@ def generate_input(type): for in_dtype in [0, 2, 5, 6]: for out_dtype in [0, 2, 5, 6]: + self.out_dtype = out_dtype dics = [ {"in_dtype": in_dtype, "out_dtype": out_dtype}, {"in_dtype": out_dtype, "out_dtype": in_dtype}, @@ -89,7 +97,7 @@ def sample_predictor_configs( ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = 
{"input_data": [1, 3, 64, 64]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [1, 3, 64, 64]} self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} def clear_dynamic_shape(): @@ -98,6 +106,8 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape and self.out_dtype == 0: + return 0, 4 return 1, 2 attrs = [ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py index 45d8f7d30a568..913445af0d2b2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_where.py @@ -195,7 +195,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if not dynamic_shape: return 0, 6 - return 1, 5 + return 1, 4 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) From b7841a2b6641c46cff7843d92707645e5ba5c4a7 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 17 Nov 2022 14:11:28 +0800 Subject: [PATCH 058/210] move "function_traits.h" from fluid to phi (#48065) --- paddle/phi/kernels/funcs/broadcast_function.h | 4 ++-- paddle/phi/kernels/funcs/elementwise_base.h | 8 ++++---- .../platform => phi/kernels/funcs}/function_traits.h | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) rename paddle/{fluid/platform => phi/kernels/funcs}/function_traits.h (93%) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 67d3a309b1f33..a222422c89fdc 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -176,7 +176,7 @@ __device__ void VectorizedBroadcastKernelImpl( #endif constexpr bool kCallElementwiseAny = - paddle::platform::FunctionTraits::has_pointer_args; + phi::funcs::FunctionTraits::has_pointer_args; phi::funcs::ElementwisePrimitiveCaller, VecSize, @@ -787,7 +787,7 @@ void BroadcastKernelForDifferentVecSize( std::vector *outs, int axis, Functor func) { - using Traits = paddle::platform::FunctionTraits; + using Traits = phi::funcs::FunctionTraits; const int kArity = Traits::has_pointer_args ? static_cast(ET) : Traits::arity; PADDLE_ENFORCE_EQ( diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index a1b0a70956931..17b0a653cc8a8 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -23,9 +23,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -#include "paddle/fluid/platform/function_traits.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/function_traits.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #define HOSTDEVICE __host__ __device__ @@ -563,7 +563,7 @@ int GetVectorizedSizeForTensors(const std::vector &ins, #ifdef PADDLE_WITH_XPU_KP int vec_size = 256; #else - using Traits = paddle::platform::FunctionTraits; + using Traits = phi::funcs::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; const int Arity = Traits::arity; int vec_size = 4; @@ -736,7 +736,7 @@ __device__ void VectorizedElementwiseKernelImpl( int num, int read_lens, Functor func) { - using Traits = paddle::platform::FunctionTraits; + using Traits = phi::funcs::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; ArgsT args[VecSize]; ConditionalT result[VecSize]; @@ -831,7 +831,7 @@ void ElementwiseKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, Functor func) { - using Traits = paddle::platform::FunctionTraits; + using Traits = phi::funcs::FunctionTraits; const int kArity = Traits::arity; PADDLE_ENFORCE_EQ(ins.size(), kArity, diff --git a/paddle/fluid/platform/function_traits.h b/paddle/phi/kernels/funcs/function_traits.h similarity index 93% rename from paddle/fluid/platform/function_traits.h rename to paddle/phi/kernels/funcs/function_traits.h index 662e3ac58a6c4..aae3a35dda125 100644 --- a/paddle/fluid/platform/function_traits.h +++ b/paddle/phi/kernels/funcs/function_traits.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.1 (the "License"); you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include -namespace paddle { -namespace platform { +namespace phi { +namespace funcs { template struct IsPointerArgs { static_assert(Arity == sizeof...(Args), "Arity and Args not match!"); @@ -57,5 +57,5 @@ struct FunctionTraits { using ArgsTuple = std::tuple; }; -} // namespace platform -} // namespace paddle +} // namespace funcs +} // namespace phi From b4460eee6959a47203ed114991162f478dda62fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 17 Nov 2022 14:14:43 +0800 Subject: [PATCH 059/210] remove fluid.layers.affine_grid API (#47851) --- python/paddle/fluid/layers/nn.py | 85 ------------------- .../unittests/test_affine_grid_function.py | 4 +- .../fluid/tests/unittests/test_layers.py | 8 +- 3 files changed, 8 insertions(+), 89 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 41766c6651aad..1e740ca967343 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -177,7 +177,6 @@ 'mul', 'maxout', 'space_to_depth', - 'affine_grid', 'affine_channel', 'similarity_focus', 'hash', @@ -9693,90 +9692,6 @@ def _attr_offsets_check(offset_val): return out -def affine_grid(theta, out_shape, name=None): - """ - :alias_main: paddle.nn.functional.affine_grid - :alias: paddle.nn.functional.affine_grid,paddle.nn.functional.vision.affine_grid - :old_api: paddle.fluid.layers.affine_grid - - It generates a grid of (x,y) coordinates using the parameters of - the affine transformation that correspond to a set of points where - the input feature map should be sampled to produce the transformed - output feature map. - - Args: - theta (Variable) - A Tensor with shape [N, 2, 3]. It contains a batch of affine transform parameters. - The data type can be float32 or float64. - out_shape (Variable | list | tuple): The shape of target output with format [batch_size, channel, height, width]. - ``out_shape`` can be a Tensor or a list or tuple. The data - type must be int32. - name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Variable: A Tensor with shape [batch_size, H, W, 2] while 'H' and 'W' are the height and width of feature map in affine transformation. The data type is the same as `theta`. - - Raises: - ValueError: If the type of arguments is not supported. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - place = fluid.CPUPlace() - theta = fluid.data(name="x", shape=[None, 2, 3], dtype="float32") - out_shape = fluid.data(name="y", shape=[4], dtype="int32") - grid_0 = fluid.layers.affine_grid(theta, out_shape) - grid_1 = fluid.layers.affine_grid(theta, [5, 3, 28, 28]) - batch_size=2 - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - output= exe.run(feed={"x": np.random.rand(batch_size,2,3).astype("float32"), - "y": np.array([5, 3, 28, 28]).astype("int32")}, - fetch_list=[grid_0.name, grid_1.name]) - print(output[0]) - print(output[1]) - """ - helper = LayerHelper('affine_grid') - - check_variable_and_dtype( - theta, 'theta', ['float32', 'float64'], 'affine_grid' - ) - - if not ( - isinstance(out_shape, list) - or isinstance(out_shape, tuple) - or isinstance(out_shape, Variable) - ): - raise ValueError("The out_shape should be a list, tuple or Variable.") - - if not isinstance(theta, Variable): - raise ValueError("The theta should be a Variable.") - - out = helper.create_variable_for_type_inference(theta.dtype) - ipts = {'Theta': theta} - attrs = {} - if isinstance(out_shape, Variable): - ipts['OutputShape'] = out_shape - check_variable_and_dtype( - out_shape, 'out_shape', ['int32'], 'affine_grid' - ) - else: - attrs['output_shape'] = out_shape - if core.is_compiled_with_rocm(): - # ROCM platform do not have MIOPEN kernel for affine_grid - attrs['use_cudnn'] = False - - helper.append_op( - type='affine_grid', - inputs=ipts, - outputs={'Output': out}, - attrs=None if len(attrs) == 0 else attrs, - ) - return out - - def pad2d( input, paddings=[0, 0, 0, 0], diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py index cc8371ae82109..420b6e61ca266 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_function.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_function.py @@ -52,7 +52,9 @@ def fluid_layer(self, place): theta_var = fluid.data( "input", self.theta_shape, dtype=self.dtype ) - y_var = fluid.layers.affine_grid(theta_var, self.output_shape) + y_var = paddle.nn.functional.affine_grid( + theta_var, self.output_shape + ) feed_dict = {"input": self.theta} exe = fluid.Executor(place) exe.run(start) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 7b7dfd399120f..2f08a12b4b4ff 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3278,7 +3278,9 @@ def make_pool2d_infershape(self): fluid.default_main_program(), fluid.default_startup_program() ): theta = self._get_data("theta", shape=[2, 3], dtype='float32') - x = fluid.layers.affine_grid(theta, out_shape=[2, 3, 244, 244]) + x = paddle.nn.functional.affine_grid( + theta, out_shape=[2, 3, 244, 244] + ) return layers.pool2d( x, pool_size=[5, 3], pool_stride=[1, 2], pool_padding=(2, 1) ) @@ -4175,8 +4177,8 @@ def test_affine_grid(self): theta = layers.data(name="theta", shape=[2, 3], dtype="float32") out_shape = layers.data(name="out_shape", shape=[-1], dtype="int32") - data_0 = layers.affine_grid(theta, out_shape) - data_1 = layers.affine_grid(theta, [5, 3, 28, 28]) + data_0 = paddle.nn.functional.affine_grid(theta, out_shape) + data_1 = paddle.nn.functional.affine_grid(theta, [5, 3, 28, 28]) self.assertIsNotNone(data_0) self.assertIsNotNone(data_1) From 071708fae3fe0d4d97f611f9cc22cf8736564910 
Mon Sep 17 00:00:00 2001 From: taixiurong Date: Thu, 17 Nov 2022 14:27:58 +0800 Subject: [PATCH 060/210] =?UTF-8?q?xpu-paddlepaddle-41=20[=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1]=20ffn=20and=20attention=20test=3Dkunlun=20(#46658)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/operators/fused/CMakeLists.txt | 2 + .../operators/fused/fused_attention_op_xpu.cc | 939 ++++++++++++++++++ .../fused/fused_feedforward_op_xpu.cc | 828 +++++++++++++++ .../fused/xpu_fused_common_function.h | 224 +++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 12 + paddle/phi/kernels/xpu/xpu_api_wrapper.h | 9 +- .../xpu/test_fused_attention_op_xpu.py | 331 ++++++ .../xpu/test_fused_feedforward_op_xpu.py | 379 +++++++ 8 files changed, 2723 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/fused/fused_attention_op_xpu.cc create mode 100644 paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc create mode 100644 paddle/fluid/operators/fused/xpu_fused_common_function.h create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 9a14d35b59990..23cdc33658d1c 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -38,6 +38,8 @@ if(WITH_XPU) op_library(resnet_basic_block_op) op_library(resnet_unit_op) op_library(fused_gemm_epilogue_op) + op_library(fused_attention_op) + op_library(fused_feedforward_op) endif() if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/operators/fused/fused_attention_op_xpu.cc b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc new file mode 100644 index 0000000000000..6bf2e3d80335f --- /dev/null +++ b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc @@ -0,0 +1,939 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/xpu_fused_common_function.h" +#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = phi::DenseTensor; + +template +class FusedAttentionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using XPUTypeT = typename XPUTypeTrait::Type; + + // inputs tensor + auto *input_x = ctx.Input("X"); + + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + + // shape [3, num_head, dim_head, dim_embed] + auto *qkv_weight = ctx.Input("QKVW"); + // shape [3 , num_head, dim_head] + auto *qkv_bias = ctx.Input("QKVBias"); + + // shape [batch_size, 1, 1, seq_len] + auto *src_mask = ctx.Input("SrcMask"); + + // shape [dim_embed, dim_embed] + auto *out_linear_weight = ctx.Input("OutLinearW"); + // shape [dim_embed] + auto *out_linear_bias = ctx.Input("OutLinearBias"); + + const Tensor *ln_scale = nullptr; + const Tensor *ln_bias = nullptr; + float epsilon = 0.0f; + + if (pre_layer_norm) { + ln_scale = ctx.Input("LnScale"); + ln_bias = ctx.Input("LnBias"); + epsilon = ctx.Attr("epsilon"); + } else { + ln_scale = ctx.Input("Ln2Scale"); + ln_bias = ctx.Input("Ln2Bias"); + epsilon = ctx.Attr("ln_epsilon"); + } + + // outputs tensor + // qkv 的值,并已经做了transpos后的值 + // shape [3, batch_size, num_head, seq_len, dim_head] + auto *TransposeOut2 = ctx.Output("TransposeOut2"); + + // shape [batch_size, num_head, seq_len, seq_len] + auto *softmax_out = ctx.Output("SoftmaxOut"); + // shape [batch_size, num_head, seq_len, seq_len] + auto *attn_dropout_mask_out = ctx.Output("AttnDropoutMaskOut"); + // shape [batch_size, num_head, seq_len, seq_len] + auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); + + // shape [[batch_size, seq_len, num_head, dim_head]] + auto *fmha_out = ctx.Output("FMHAOut"); + + // shape [batch_size, seq_len, dim_embed] + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + + // final output + // shape [batch_size, seq_len, dim_embed] + auto *out = ctx.Output("Y"); + + // 下面这个tensor是不需要返回, 但是新的动态图需要 + auto *QKOut = ctx.Output("QKOut"); + QKOut->mutable_data(ctx.GetPlace()); + auto *QKTVOut = ctx.Output("QKTVOut"); + QKTVOut->mutable_data(ctx.GetPlace()); + auto *OutLinearOut = ctx.Output("OutLinearOut"); + OutLinearOut->mutable_data(ctx.GetPlace()); + auto *QKVBiasOut = ctx.Output("QKVBiasOut"); + QKVBiasOut->mutable_data(ctx.GetPlace()); + auto *SrcMaskOut = ctx.Output("SrcMaskOut"); + SrcMaskOut->mutable_data(ctx.GetPlace()); + auto *qkv_out = ctx.Output("QKVOut"); + qkv_out->mutable_data(ctx.GetPlace()); + + Tensor *bias_dropout_residual_out = nullptr; + Tensor *ln_mean = nullptr; + Tensor *ln_var = nullptr; + Tensor *ln_out = nullptr; + + if (pre_layer_norm) { + ln_mean = ctx.Output("LnMean"); + ln_var = ctx.Output("LnVariance"); + ln_out = ctx.Output("LnOut"); + } else { + ln_mean = ctx.Output("Ln2Mean"); + ln_var = ctx.Output("Ln2Variance"); + bias_dropout_residual_out = ctx.Output("BiasDropoutResidualOut"); + } + + // dropout info + float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); + + bool is_test_1 = ctx.Attr("is_test"); + + auto &dropout_implementation_1 = + ctx.Attr("attn_dropout_implementation"); + + bool 
is_upscale_in_train_1 = + (dropout_implementation_1 == "upscale_in_train"); + auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + + bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); + + int seed_val_1 = ctx.Attr("attn_dropout_seed"); + + XPUDropoutParam attn_dropout_param; + attn_dropout_param.initXPUDropoutParam(attn_dropout_rate, + is_upscale_in_train_1, + is_test_1, + is_fix_seed_1, + seed_1, + seed_val_1); + + XPUDropoutParam dropout_param(ctx, 0); + + // 先计算纬度 + const auto input_x_dims = input_x->dims(); + const auto qkv_w_dims = qkv_weight->dims(); + + int batch_size = input_x_dims[0]; + int seq_len = input_x_dims[1]; + int embed_dims = input_x_dims[2]; + int num_heads = qkv_w_dims[1]; + int head_dims = qkv_w_dims[2]; + + // 输入指针 + const XPUTypeT *input_x_ptr = + reinterpret_cast(input_x->data()); + + const XPUTypeT *qkv_weight_ptr = + reinterpret_cast(qkv_weight->data()); + const XPUTypeT *qkv_bias_ptr = + reinterpret_cast(qkv_bias->data()); + const XPUTypeT *src_mask_ptr = + (src_mask == nullptr) + ? (nullptr) + : (reinterpret_cast(src_mask->data())); + + const XPUTypeT *out_linear_weight_ptr = + reinterpret_cast(out_linear_weight->data()); + + const XPUTypeT *out_linear_bias_ptr = + reinterpret_cast(out_linear_bias->data()); + + const float *ln_scale_ptr = + (ln_scale == nullptr) ? (nullptr) : ln_scale->data(); + + const float *ln_bias_ptr = + (ln_bias == nullptr) ? (nullptr) : ln_bias->data(); + + // 输出指针 + XPUTypeT *qkv_transpose_out_ptr = reinterpret_cast( + TransposeOut2->mutable_data(ctx.GetPlace())); + + XPUTypeT *softmax_out_ptr = reinterpret_cast( + softmax_out->mutable_data(ctx.GetPlace())); + + XPUTypeT *attn_dropout_mask_out_ptr = reinterpret_cast( + attn_dropout_mask_out->mutable_data(ctx.GetPlace())); + + XPUTypeT *attn_dropout_out_ptr = reinterpret_cast( + attn_dropout_out->mutable_data(ctx.GetPlace())); + + XPUTypeT *fmha_out_ptr = + reinterpret_cast(fmha_out->mutable_data(ctx.GetPlace())); + + XPUTypeT *dropout_mask_out_ptr = reinterpret_cast( + dropout_mask_out->mutable_data(ctx.GetPlace())); + + XPUTypeT *out_ptr = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + + XPUTypeT *bias_dropout_residual_out_ptr = + (bias_dropout_residual_out == nullptr) + ? (nullptr) + : (reinterpret_cast( + bias_dropout_residual_out->mutable_data(ctx.GetPlace()))); + + float *ln_mean_ptr = (ln_mean == nullptr) + ? (nullptr) + : ln_mean->mutable_data(ctx.GetPlace()); + + float *ln_var_ptr = (ln_var == nullptr) + ? (nullptr) + : ln_var->mutable_data(ctx.GetPlace()); + + XPUTypeT *ln_out_ptr = (ln_out == nullptr) + ? 
(nullptr) + : (reinterpret_cast( + ln_out->mutable_data(ctx.GetPlace()))); + + auto &dev_ctx = ctx.template device_context(); + + xpu::Context *xpu_ctx = dev_ctx.x_context(); + + xpu::ctx_guard RAII_GUARD(xpu_ctx); + + int l3_total_size = xpu_ctx->_l3_mgr.get_size(); + + XPUTypeT *qkv_before_transpos_ptr = + NULL; // x2[batch_size, seq_len, 3, num_heads,head_dims] + XPUTypeT *qk_ptr = NULL; // qk [batch_size, num_heads, seq_len, seq_len] + XPUTypeT *qkv_ptr = NULL; // qkv[batch_size, num_heads, seq_len, head_dims] + XPUTypeT *linear_out_ptr = + NULL; // x4, x5 [batch_size, seq_len, embed_dims] + + int temp_size_1 = batch_size * seq_len * 3 * num_heads * head_dims; + int temp_size_2 = batch_size * num_heads * seq_len * seq_len; + int temp_size_3 = batch_size * num_heads * seq_len * head_dims; + int temp_size_4 = batch_size * seq_len * embed_dims; + + std::vector temp_vec = { + temp_size_1, temp_size_2, temp_size_3, temp_size_4}; + std::sort(temp_vec.begin(), temp_vec.end(), std::greater()); + XPUTypeT *max_gm_ptr = RAII_GUARD.alloc(temp_vec[0]); + PADDLE_ENFORCE_XDNN_NOT_NULL(max_gm_ptr); + qkv_before_transpos_ptr = max_gm_ptr; + qk_ptr = max_gm_ptr; + qkv_ptr = max_gm_ptr; + linear_out_ptr = max_gm_ptr; + int sizeof_t = sizeof(XPUTypeT); + for (size_t i = 0; i < temp_vec.size(); ++i) { + if (l3_total_size >= temp_vec[i] * sizeof_t) { + XPUTypeT *l3_ptr = RAII_GUARD.alloc_l3(temp_vec[i]); + qkv_before_transpos_ptr = + (temp_size_1 <= temp_vec[i]) ? l3_ptr : max_gm_ptr; + qk_ptr = (temp_size_2 <= temp_vec[i]) ? l3_ptr : max_gm_ptr; + qkv_ptr = (temp_size_3 <= temp_vec[i]) ? l3_ptr : max_gm_ptr; + linear_out_ptr = (temp_size_4 <= temp_vec[i]) ? l3_ptr : max_gm_ptr; + break; + } + } + + int r = 0; + const XPUTypeT *x_cacl_ptr = input_x_ptr; + if (pre_layer_norm) { + r = xpu::layer_norm(xpu_ctx, + input_x_ptr, + ln_out_ptr, + batch_size * seq_len, + embed_dims, + epsilon, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_var_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); + x_cacl_ptr = ln_out_ptr; + } + + // fc + phi::XpuFcInfo qkv_fc_info; + qkv_fc_info.InitFcInfo(0, + batch_size * seq_len, + 3 * num_heads * head_dims, + embed_dims, + false, + true, + nullptr, + nullptr, + nullptr); + + phi::MatMulXPUFunction(xpu_ctx, + x_cacl_ptr, + qkv_weight_ptr, + qkv_before_transpos_ptr, + qkv_fc_info, + 1.0f); + + // bias + r = xpu::broadcast_add(xpu_ctx, + qkv_before_transpos_ptr, + qkv_bias_ptr, + qkv_before_transpos_ptr, + {batch_size * seq_len, 3 * num_heads * head_dims}, + {3 * num_heads * head_dims}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + // transpose + r = xpu::transpose(xpu_ctx, + qkv_before_transpos_ptr, + qkv_transpose_out_ptr, + {batch_size, seq_len, 3, num_heads, head_dims}, + {2, 0, 3, 1, 4}); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + + int qkv_every_size = batch_size * seq_len * num_heads * head_dims; + { + float alpha = 1.0 / sqrt(head_dims); + r = scale(xpu_ctx, + qkv_transpose_out_ptr, + qkv_transpose_out_ptr, + qkv_every_size, + false, + alpha, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } + + // begin fhma + // 1. qk 2. qk + mask 3. softmax 4.dropout 5. qkv 6. 
transpos + { + const XPUTypeT *q_ptr = qkv_transpose_out_ptr; + const XPUTypeT *k_ptr = q_ptr + qkv_every_size; + const XPUTypeT *v_ptr = k_ptr + qkv_every_size; + phi::XpuFcInfo qk_fc_info; + qk_fc_info.InitFcInfo(batch_size * num_heads, + seq_len, + seq_len, + head_dims, + false, + true, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction( + xpu_ctx, q_ptr, k_ptr, qk_ptr, qk_fc_info, 1.0f); + + if (src_mask_ptr) { + r = xpu::broadcast_add(xpu_ctx, + qk_ptr, + src_mask_ptr, + qk_ptr, + {batch_size, num_heads, seq_len, seq_len}, + {batch_size, 1, 1, seq_len}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + } + // do softmax + r = xpu::softmax(xpu_ctx, + qk_ptr, + softmax_out_ptr, + {batch_size, num_heads, seq_len, seq_len}, + 3); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); + + // do dropout + Dropout(xpu_ctx, + softmax_out_ptr, + attn_dropout_mask_out_ptr, + attn_dropout_out_ptr, + attn_dropout_param, + batch_size * num_heads * seq_len * seq_len); + + phi::XpuFcInfo qktv_fc_info; + qktv_fc_info.InitFcInfo(batch_size * num_heads, + seq_len, + head_dims, + seq_len, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction( + xpu_ctx, attn_dropout_out_ptr, v_ptr, qkv_ptr, qktv_fc_info, 1.0f); + r = xpu::transpose(xpu_ctx, + qkv_ptr, + fmha_out_ptr, + {batch_size, num_heads, seq_len, head_dims}, + {0, 2, 1, 3}); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + } + + // linear_out + phi::XpuFcInfo linear_fc_info; + linear_fc_info.InitFcInfo(0, + batch_size * seq_len, + embed_dims, + embed_dims, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction(xpu_ctx, + fmha_out_ptr, + out_linear_weight_ptr, + linear_out_ptr, + linear_fc_info, + 1.0f); + + // out_linear_bias_ptr + r = xpu::broadcast_add(xpu_ctx, + linear_out_ptr, + out_linear_bias_ptr, + linear_out_ptr, + {batch_size * seq_len, embed_dims}, + {embed_dims}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + Dropout(xpu_ctx, + linear_out_ptr, + dropout_mask_out_ptr, + linear_out_ptr, + dropout_param, + batch_size * seq_len * embed_dims); + + XPUTypeT *real_out_ptr = out_ptr; + if (pre_layer_norm == false) { + real_out_ptr = bias_dropout_residual_out_ptr; + } + + r = xpu::add(xpu_ctx, + linear_out_ptr, + input_x_ptr, + real_out_ptr, + batch_size * seq_len * embed_dims); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + + if (pre_layer_norm == false) { + r = xpu::layer_norm(xpu_ctx, + real_out_ptr, + out_ptr, + batch_size * seq_len, + embed_dims, + epsilon, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_var_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); + } + } +}; + +// template +template +class FusedAttentionGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using XPUTypeT = typename XPUTypeTrait::Type; + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + + // dropout info + float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); + bool is_test_1 = ctx.Attr("is_test"); + auto &dropout_implementation_1 = + ctx.Attr("attn_dropout_implementation"); + bool is_upscale_in_train_1 = + (dropout_implementation_1 == "upscale_in_train"); + auto *seed_1 = ctx.HasInput("Seed1") ? 
ctx.Input("Seed1") : nullptr; + bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); + int seed_val_1 = ctx.Attr("attn_dropout_seed"); + + XPUDropoutParam attn_dropout_param; + attn_dropout_param.initXPUDropoutParam(attn_dropout_prob, + is_upscale_in_train_1, + is_test_1, + is_fix_seed_1, + seed_1, + seed_val_1); + + XPUDropoutParam dropout_param(ctx, 0); + // get inputs. + auto *d_y = ctx.Input(framework::GradVarName("Y")); + const XPUTypeT *d_y_ptr = + reinterpret_cast(d_y->data()); + // 前向必要参数 + auto *input_x = ctx.Input("X"); + const XPUTypeT *input_x_ptr = + reinterpret_cast(input_x->data()); + auto *qkv_transpose_out = ctx.Input("TransposeOut2"); + const XPUTypeT *qkv_transpose_out_ptr = + reinterpret_cast(qkv_transpose_out->data()); + auto *qkv_weight = ctx.Input("QKVW"); + const XPUTypeT *qkv_weight_ptr = + reinterpret_cast(qkv_weight->data()); + + auto *softmax_out = ctx.Input("SoftmaxOut"); + const XPUTypeT *softmax_out_ptr = + reinterpret_cast(softmax_out->data()); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + const XPUTypeT *attn_dropout_out_ptr = + reinterpret_cast(attn_dropout_out->data()); + + auto *attn_dropout_mask = ctx.Input("AttnDropoutMaskOut"); + const XPUTypeT *attn_dropout_mask_ptr = + reinterpret_cast(attn_dropout_mask->data()); + auto *fmha_out = ctx.Input("FMHAOut"); + const XPUTypeT *fmha_out_ptr = + reinterpret_cast(fmha_out->data()); + + auto *out_linear_weight = ctx.Input("OutLinearW"); + const XPUTypeT *out_linear_weight_ptr = + reinterpret_cast(out_linear_weight->data()); + + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + const XPUTypeT *dropout_mask_out_ptr = + reinterpret_cast(dropout_mask_out->data()); + // 需要计算的梯度 + auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); + XPUTypeT *d_qkv_weight_ptr = reinterpret_cast( + d_qkv_weight->mutable_data(ctx.GetPlace())); + + auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + XPUTypeT *d_qkv_bias_ptr = reinterpret_cast( + d_qkv_bias->mutable_data(ctx.GetPlace())); + auto *d_out_linear_weight = + ctx.Output(framework::GradVarName("OutLinearW")); + + XPUTypeT *d_out_linear_weight_ptr = reinterpret_cast( + d_out_linear_weight->mutable_data(ctx.GetPlace())); + + auto *d_out_linear_bias = + ctx.Output(framework::GradVarName("OutLinearBias")); + XPUTypeT *d_out_linear_bias_ptr = reinterpret_cast( + d_out_linear_bias->mutable_data(ctx.GetPlace())); + // 有可能需要 + auto *d_src_mask_out = + ctx.Output(framework::GradVarName("SrcMaskOut")); + XPUTypeT *d_src_mask_out_ptr = + (d_src_mask_out == nullptr) + ? 
(nullptr) + : (reinterpret_cast( + d_src_mask_out->mutable_data(ctx.GetPlace()))); + // 输出 dx + auto *d_x = ctx.Output(framework::GradVarName("X")); + XPUTypeT *d_x_ptr = + reinterpret_cast(d_x->mutable_data(ctx.GetPlace())); + + const Tensor *ln_out = nullptr; + const Tensor *bias_dropout_residual_out = nullptr; + const Tensor *ln_scale = nullptr; + const Tensor *ln_mean = nullptr; + const Tensor *ln_var = nullptr; + Tensor *d_ln_scale = nullptr; + Tensor *d_ln_bias = nullptr; + + const XPUTypeT *ln_out_ptr = NULL; + const float *ln_scale_ptr = NULL; + const float *ln_mean_ptr = NULL; + const float *ln_var_ptr = NULL; + const XPUTypeT *bias_dropout_residual_out_ptr = NULL; + float *d_ln_scale_ptr = nullptr; + float *d_ln_bias_ptr = nullptr; + + float epsilon = 0.0f; + + if (pre_layer_norm) { + ln_out = ctx.Input("LnOut"); + ln_out_ptr = reinterpret_cast(ln_out->data()); + ln_scale = ctx.Input("LnScale"); + ln_mean = ctx.Input("LnMean"); + ln_var = ctx.Input("LnVariance"); + epsilon = ctx.Attr("epsilon"); + d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + + } else { + ln_scale = ctx.Input("Ln2Scale"); + ln_mean = ctx.Input("Ln2Mean"); + ln_var = ctx.Input("Ln2Variance"); + epsilon = ctx.Attr("ln_epsilon"); + d_ln_scale = ctx.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = ctx.Output(framework::GradVarName("Ln2Bias")); + bias_dropout_residual_out = ctx.Input("BiasDropoutResidualOut"); + bias_dropout_residual_out_ptr = reinterpret_cast( + bias_dropout_residual_out->data()); + } + + ln_scale_ptr = ln_scale->data(); + ln_mean_ptr = ln_mean->data(); + ln_var_ptr = ln_var->data(); + d_ln_scale_ptr = d_ln_scale->mutable_data(ctx.GetPlace()); + d_ln_bias_ptr = d_ln_bias->mutable_data(ctx.GetPlace()); + + const auto input_x_dims = input_x->dims(); + const auto qkv_w_dims = qkv_weight->dims(); + + int batch_size = input_x_dims[0]; + int seq_len = input_x_dims[1]; + int embed_dims = input_x_dims[2]; + int num_heads = qkv_w_dims[1]; + int head_dims = qkv_w_dims[2]; + + auto &dev_ctx = ctx.template device_context(); + xpu::Context *xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + + int r = 0; + // int l3_total_size = xpu_ctx->_l3_mgr.get_size(); + XPUTypeT *d_ln_grad_ptr = NULL; // dx5 [batch_size, seq_len, hidden] + XPUTypeT *d_dropout_grad_ptr = NULL; // dx5 [batch_size, seq_len, hidden] + + XPUTypeT *d_fmha_out_ptr = + NULL; // d_fmha_out [batch_size, seq_len, num_heads, head_dims] + XPUTypeT *d_fmha_out_transpos_tmp_ptr = + NULL; // d_fmha_out_transpos [batch_size, seq_len, num_heads, + // head_dims] + + XPUTypeT *d_qk_ptr = + NULL; // d_qk_ptr[batch_size, num_heads, seq_len, seq_len] + + XPUTypeT *d_combination_qkv_ptr = + NULL; // d_combination_qkv_ptr[3, batch_size, num_heads, seq_len, + // head_dims] + XPUTypeT *d_transpos_qkv_ptr = + NULL; // dx2 [batch_size, seq_len, 3, num_heads, head_dims] + + XPUTypeT *d_last_layernorm_grad_ptr = + NULL; // d_layer_out [batch_size, seq_len, embed_dims] + + const XPUTypeT *dy_input_ptr = d_y_ptr; + + d_ln_grad_ptr = + RAII_GUARD.alloc(batch_size * seq_len * embed_dims); + d_dropout_grad_ptr = + RAII_GUARD.alloc_l3_or_gm(batch_size * seq_len * embed_dims); + d_fmha_out_ptr = RAII_GUARD.alloc_l3_or_gm(batch_size * seq_len * + num_heads * head_dims); + d_combination_qkv_ptr = + RAII_GUARD.alloc(batch_size * seq_len * embed_dims * 3); + d_transpos_qkv_ptr = RAII_GUARD.alloc_l3_or_gm( + batch_size * seq_len * embed_dims * 3); + d_fmha_out_transpos_tmp_ptr = + 
RAII_GUARD.alloc_l3_or_gm(batch_size * seq_len * embed_dims); + d_qk_ptr = RAII_GUARD.alloc_l3_or_gm(batch_size * seq_len * + seq_len * num_heads); + d_last_layernorm_grad_ptr = + RAII_GUARD.alloc_l3_or_gm(batch_size * seq_len * embed_dims); + + if (pre_layer_norm == false) { + r = xpu::layer_norm_grad(xpu_ctx, + bias_dropout_residual_out_ptr, + d_y_ptr, + d_ln_grad_ptr, + batch_size * seq_len, + embed_dims, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_var_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + dy_input_ptr = d_ln_grad_ptr; + } + // dropout_grad + DropoutGrad(xpu_ctx, + dy_input_ptr, + dropout_mask_out_ptr, + d_dropout_grad_ptr, + dropout_param, + batch_size * num_heads * seq_len * head_dims); + + // linear_out + phi::XpuFcInfo linear_fc_info; + linear_fc_info.InitFcInfo(0, + batch_size * seq_len, + embed_dims, + embed_dims, + false, + false, + nullptr, + nullptr, + nullptr); + const XPUTypeT *a_1 = reinterpret_cast(NULL); + const XPUTypeT *b_1 = reinterpret_cast(NULL); + const XPUTypeT *a_2 = reinterpret_cast(NULL); + const XPUTypeT *b_2 = reinterpret_cast(NULL); + + XPUTypeT *c_1 = d_fmha_out_ptr; + XPUTypeT *c_2 = d_out_linear_weight_ptr; + phi::XpuFcInfo info_dfmha; + phi::XpuFcInfo info_dlinear_w; + + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + linear_fc_info, + false, + false, + fmha_out_ptr, + out_linear_weight_ptr, + d_dropout_grad_ptr); + + std::tie(info_dfmha, info_dlinear_w, a_1, b_1, a_2, b_2) = fc_info; + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_dlinear_w, 1.0f, true); + + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_dfmha, 1.0f, true); + + // dlinear_bias + r = xpu::reduce_sum(xpu_ctx, + d_dropout_grad_ptr, + d_out_linear_bias_ptr, + {batch_size * seq_len, embed_dims}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + { + int qkv_size = batch_size * seq_len * num_heads * head_dims; + const XPUTypeT *q_out_ptr = qkv_transpose_out_ptr; + const XPUTypeT *k_out_ptr = q_out_ptr + qkv_size; + const XPUTypeT *v_out_ptr = k_out_ptr + qkv_size; + XPUTypeT *d_q_out_ptr = d_combination_qkv_ptr; + XPUTypeT *d_k_out_ptr = d_q_out_ptr + qkv_size; + XPUTypeT *d_v_out_ptr = d_k_out_ptr + qkv_size; + r = xpu::transpose(xpu_ctx, + d_fmha_out_ptr, + d_fmha_out_transpos_tmp_ptr, + {batch_size, seq_len, num_heads, head_dims}, + {0, 2, 1, 3}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + + phi::XpuFcInfo qktv_fc_info; + qktv_fc_info.InitFcInfo(batch_size * num_heads, + seq_len, + head_dims, + seq_len, + false, + false, + nullptr, + nullptr, + nullptr); + + const XPUTypeT *a_1 = reinterpret_cast(NULL); + const XPUTypeT *b_1 = reinterpret_cast(NULL); + const XPUTypeT *a_2 = reinterpret_cast(NULL); + const XPUTypeT *b_2 = reinterpret_cast(NULL); + XPUTypeT *c_1 = d_qk_ptr; + XPUTypeT *c_2 = d_v_out_ptr; + phi::XpuFcInfo info_d_qk; + phi::XpuFcInfo info_d_v; + + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + qktv_fc_info, + false, + false, + attn_dropout_out_ptr, + v_out_ptr, + d_fmha_out_transpos_tmp_ptr); + + std::tie(info_d_qk, info_d_v, a_1, b_1, a_2, b_2) = fc_info; + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_d_qk, 1.0f, true); + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_d_v, 1.0f, true); + + DropoutGrad(xpu_ctx, + d_qk_ptr, + attn_dropout_mask_ptr, + d_qk_ptr, + attn_dropout_param, + batch_size * seq_len * seq_len * num_heads); + + r = xpu::softmax_grad(xpu_ctx, + softmax_out_ptr, + d_qk_ptr, + d_qk_ptr, + {batch_size, 
num_heads, seq_len, seq_len}, + 3); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); + + if (d_src_mask_out_ptr) { + r = xpu::copy(xpu_ctx, + d_qk_ptr, + d_src_mask_out_ptr, + batch_size * seq_len * seq_len * num_heads); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + } + phi::XpuFcInfo qk_fc_info; + qk_fc_info.InitFcInfo(batch_size * num_heads, + seq_len, + seq_len, + head_dims, + false, + true, + nullptr, + nullptr, + nullptr); + + a_1 = reinterpret_cast(NULL); + b_1 = reinterpret_cast(NULL); + a_2 = reinterpret_cast(NULL); + b_2 = reinterpret_cast(NULL); + c_1 = d_q_out_ptr; + c_2 = d_k_out_ptr; + phi::XpuFcInfo info_d_q; + phi::XpuFcInfo info_d_k; + + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + qk_fc_info, + false, + true, + q_out_ptr, + k_out_ptr, + d_qk_ptr); + + std::tie(info_d_q, info_d_k, a_1, b_1, a_2, b_2) = fc_info; + + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_d_q, 1.0f / sqrt(head_dims), true); + + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_d_k, 1.0f, true); + } + + // + r = xpu::transpose(xpu_ctx, + d_combination_qkv_ptr, + d_transpos_qkv_ptr, + {3, batch_size, num_heads, seq_len, head_dims}, + {1, 3, 0, 2, 4}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + // dx and d_qkv_w + phi::XpuFcInfo qkv_fc_info; + qkv_fc_info.InitFcInfo(0, + batch_size * seq_len, + 3 * num_heads * head_dims, + embed_dims, + false, + true, + nullptr, + nullptr, + nullptr); + + a_1 = reinterpret_cast(NULL); + b_1 = reinterpret_cast(NULL); + a_2 = reinterpret_cast(NULL); + b_2 = reinterpret_cast(NULL); + c_1 = (pre_layer_norm == true) ? d_last_layernorm_grad_ptr : d_x_ptr; + c_2 = d_qkv_weight_ptr; + phi::XpuFcInfo info_d_x; + phi::XpuFcInfo info_d_qkv_w; + + const XPUTypeT *use_calc_input_x_ptr = + (pre_layer_norm == true) ? ln_out_ptr : input_x_ptr; + + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + qkv_fc_info, + false, + true, + use_calc_input_x_ptr, + qkv_weight_ptr, + d_transpos_qkv_ptr); + + std::tie(info_d_x, info_d_qkv_w, a_1, b_1, a_2, b_2) = fc_info; + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_d_x, 1.0f, true); + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_d_qkv_w, 1.0f, true); + + // d_qkv_bias + r = xpu::reduce_sum(xpu_ctx, + d_transpos_qkv_ptr, + d_qkv_bias_ptr, + {batch_size * seq_len, 3 * embed_dims}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + if (pre_layer_norm) { + r = xpu::layer_norm_grad(xpu_ctx, + input_x_ptr, + c_1, + d_x_ptr, + batch_size * seq_len, + embed_dims, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_var_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + } + + // add residual dy + r = xpu::add(xpu_ctx, + dy_input_ptr, + d_x_ptr, + d_x_ptr, + batch_size * seq_len * embed_dims); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL( + fused_attention, + ops::FusedAttentionOpKernel, + ops::FusedAttentionOpKernel); + +REGISTER_OP_XPU_KERNEL( + fused_attention_grad, + ops::FusedAttentionGradXPUKernel, + ops::FusedAttentionGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc new file mode 100644 index 0000000000000..b94d37a921fb6 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc @@ -0,0 +1,828 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/operators/fused/xpu_fused_common_function.h" + +namespace paddle { +namespace operators { + +using Tensor = phi::DenseTensor; + +template +class FusedFeedForwardXPUKernel : public framework::OpKernel { + using XPUTypeT = typename XPUTypeTrait::Type; + + public: + void FFN(const phi::XPUContext& dev_ctx, + const Tensor* x, + const Tensor* linear1_weight, + const Tensor* linear1_bias, + const Tensor* linear2_weight, + const Tensor* linear2_bias, + const Tensor* ln_scale, + const Tensor* ln_bias, + Tensor* out, + Tensor* dropout1_mask, + Tensor* dropout2_mask, + Tensor* ln_mean, + Tensor* ln_variance, + Tensor* linear1_out, + Tensor* ln1_out, + Tensor* dropout1_out, + Tensor* dropout2_out, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon1, + const float epsilon2, + const XPUDropoutParam& dropout_param1, + const XPUDropoutParam& dropout_param2, + int ring_id) const { + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + + int r = xpu::SUCCESS; + + const XPUTypeT* x_ptr = reinterpret_cast(x->data()); + const XPUTypeT* residual_ptr = x_ptr; + const XPUTypeT* linear1_weight_ptr = + reinterpret_cast(linear1_weight->data()); + const XPUTypeT* linear1_bias_ptr = + reinterpret_cast(linear1_bias->data()); + const XPUTypeT* linear2_weight_ptr = + reinterpret_cast(linear2_weight->data()); + const XPUTypeT* linear2_bias_ptr = + reinterpret_cast(linear2_bias->data()); + + const float* ln_scale_ptr = ln_scale->data(); + + const float* ln_bias_ptr = ln_bias->data(); + + // out + XPUTypeT* out_ptr = reinterpret_cast(out->data()); + XPUTypeT* linear1_out_ptr = + reinterpret_cast(linear1_out->data()); + XPUTypeT* dropout1_mask_ptr = + reinterpret_cast(dropout1_mask->data()); + XPUTypeT* dropout2_mask_ptr = + reinterpret_cast(dropout2_mask->data()); + float* ln_mean_ptr = ln_mean->data(); + float* ln_variance_ptr = ln_variance->data(); + + XPUTypeT* dropout1_out_ptr = + reinterpret_cast(dropout1_out->data()); + XPUTypeT* dropout2_out_ptr = + reinterpret_cast(dropout2_out->data()); + + size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); + XPUTypeT* linear2_before_tmp_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* linear2_after_tmp_ptr = NULL; // d_model * bsz_seq + if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { + XPUTypeT* l3_ptr = + RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); + linear2_before_tmp_ptr = linear2_after_tmp_ptr = l3_ptr; + } else if ((l3_total_size < dim_feedforward * bsz_seq 
* sizeof(T)) && + (l3_total_size >= d_model * bsz_seq * sizeof(T))) { + XPUTypeT* l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_ptr); + linear2_after_tmp_ptr = l3_ptr; + linear2_before_tmp_ptr = + RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(linear2_before_tmp_ptr); + + } else { + XPUTypeT* gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(gm_ptr); + linear2_before_tmp_ptr = linear2_after_tmp_ptr = gm_ptr; + } + + // layernorm + if (pre_layer_norm) { + XPUTypeT* ln1_out_ptr = reinterpret_cast(ln1_out->data()); + r = xpu::layer_norm(xpu_ctx, + x_ptr, + ln1_out_ptr, + bsz_seq, + d_model, + epsilon1, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_variance_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm "); + x_ptr = ln1_out_ptr; + } + + // fc + phi::XpuFcInfo linear1_fc_info; + linear1_fc_info.InitFcInfo(0, + bsz_seq, + dim_feedforward, + d_model, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction(xpu_ctx, + x_ptr, + linear1_weight_ptr, + linear2_before_tmp_ptr, + linear1_fc_info, + 1.0f); + + // bias + r = xpu::broadcast_add(xpu_ctx, + linear2_before_tmp_ptr, + linear1_bias_ptr, + linear1_out_ptr, + {bsz_seq, dim_feedforward}, + {dim_feedforward}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + // act + if (act_method == "gelu") { + r = xpu::gelu(xpu_ctx, + linear1_out_ptr, + linear2_before_tmp_ptr, + linear1_out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu"); + } else if (act_method == "relu") { + r = xpu::relu(xpu_ctx, + linear1_out_ptr, + linear2_before_tmp_ptr, + linear1_out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently only supports gelu or relu activation functions!")); + } + + // dropout1 + Dropout(xpu_ctx, + linear2_before_tmp_ptr, + dropout1_mask_ptr, + dropout1_out_ptr, + dropout_param1, + dropout1_out->numel()); + + // fc + phi::XpuFcInfo linear2_fc_info; + linear2_fc_info.InitFcInfo(0, + bsz_seq, + d_model, + dim_feedforward, + false, + false, + nullptr, + nullptr, + nullptr); + phi::MatMulXPUFunction(xpu_ctx, + dropout1_out_ptr, + linear2_weight_ptr, + dropout2_out_ptr, + linear2_fc_info, + 1.0f); + + // bias + r = xpu::broadcast_add(xpu_ctx, + dropout2_out_ptr, + linear2_bias_ptr, + dropout2_out_ptr, + {bsz_seq, d_model}, + {d_model}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + // dropout2 + Dropout(xpu_ctx, + dropout2_out_ptr, + dropout2_mask_ptr, + dropout2_out_ptr, + dropout_param2, + dropout2_out->numel()); + + // residual_ptr + dropout_out + XPUTypeT* residual_add_out_ptr = out_ptr; + if (pre_layer_norm == false) { + residual_add_out_ptr = dropout2_out_ptr; + } + r = xpu::broadcast_add(xpu_ctx, + residual_ptr, + dropout2_out_ptr, + residual_add_out_ptr, + {bsz_seq, d_model}, + {bsz_seq, d_model}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + + if (pre_layer_norm == false) { + r = xpu::layer_norm(xpu_ctx, + residual_add_out_ptr, + out_ptr, + bsz_seq, + d_model, + epsilon2, + ln_scale_ptr, + ln_bias_ptr, + ln_mean_ptr, + ln_variance_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm"); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto place = context.GetPlace(); + + auto* x = context.Input("X"); + + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = 
context.Input("Linear2Bias"); + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + + const Tensor* ln_scale = nullptr; + const Tensor* ln_bias = nullptr; + Tensor* ln_mean = nullptr; + Tensor* ln_variance = nullptr; + Tensor* ln1_out = nullptr; + + if (pre_layer_norm) { + ln_scale = context.Input("Ln1Scale"); + ln_bias = context.Input("Ln1Bias"); + ln_mean = context.Output("Ln1Mean"); + ln_variance = context.Output("Ln1Variance"); + ln1_out = context.Output("Ln1Out"); + ln1_out->mutable_data(place); + } else { + ln_scale = context.Input("Ln2Scale"); + ln_bias = context.Input("Ln2Bias"); + ln_mean = context.Output("Ln2Mean"); + ln_variance = context.Output("Ln2Variance"); + } + + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = context.Output("Linear1Out"); + + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); + + const std::string act_method = context.Attr("act_method"); + + const int ring_id = context.Attr("ring_id"); + const float epsilon1 = context.Attr("ln1_epsilon"); + const float epsilon2 = context.Attr("ln2_epsilon"); + XPUDropoutParam dropout_param1; + dropout_param1.initXPUDropoutParam(context, 1); + XPUDropoutParam dropout_param2; + dropout_param2.initXPUDropoutParam(context, 2); + + ln_mean->mutable_data(place); + ln_variance->mutable_data(place); + out->mutable_data(place); + dropout1_mask->mutable_data(place); + dropout2_mask->mutable_data(place); + dropout1_out->mutable_data(place); + dropout2_out->mutable_data(place); + linear1_out->mutable_data(place); + + auto x_dim = x->dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + RowMatrixFromVector(x_dim), 0, false); + + auto dim = linear1_weight->dims(); + int d_model = dim[0]; + int dim_feedforward = dim[dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + auto& dev_ctx = context.template device_context(); + FFN(dev_ctx, + x, + linear1_weight, + linear1_bias, + linear2_weight, + linear2_bias, + ln_scale, + ln_bias, + out, + dropout1_mask, + dropout2_mask, + ln_mean, + ln_variance, + linear1_out, + ln1_out, + dropout1_out, + dropout2_out, + bsz_seq, + d_model, + dim_feedforward, + act_method, + pre_layer_norm, + epsilon1, + epsilon2, + dropout_param1, + dropout_param2, + ring_id); + } +}; + +template +class FusedFeedForwardGradXPUKernel : public framework::OpKernel { + using XPUTypeT = typename XPUTypeTrait::Type; + + public: + void FFNGrad(const phi::XPUContext& dev_ctx, + const Tensor* d_out, + const Tensor* x, + const Tensor* dropout1_mask, + const Tensor* dropout2_mask, + const Tensor* linear1_out, + const Tensor* ln1_out, + const Tensor* dropout1_out, + const Tensor* dropout2_out, + const Tensor* linear1_weight, + const Tensor* linear2_weight, + const Tensor* ln_scale, + const Tensor* ln_mean, + const Tensor* ln_variance, + Tensor* d_x, + Tensor* d_linear1_weight, + Tensor* d_linear1_bias, + Tensor* d_linear2_weight, + Tensor* d_linear2_bias, + Tensor* d_ln_scale, + Tensor* d_ln_bias, + const int bsz_seq, + const int d_model, + const int dim_feedforward, + const XPUDropoutParam& dropout_param1, + const XPUDropoutParam& dropout_param2, + const std::string& act_method, + const bool pre_layer_norm, + const float epsilon, + const int ring_id) const { + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + int r = xpu::SUCCESS; + + // inputs ptr + const XPUTypeT* d_out_ptr = 
+ reinterpret_cast(d_out->data()); + const XPUTypeT* x_ptr = reinterpret_cast(x->data()); + const XPUTypeT* dropout1_mask_ptr = + reinterpret_cast(dropout1_mask->data()); + const XPUTypeT* dropout2_mask_ptr = + reinterpret_cast(dropout2_mask->data()); + const XPUTypeT* linear1_out_ptr = + reinterpret_cast(linear1_out->data()); + const XPUTypeT* dropout1_out_ptr = + reinterpret_cast(dropout1_out->data()); + const XPUTypeT* linear1_weight_ptr = + reinterpret_cast(linear1_weight->data()); + const XPUTypeT* linear2_weight_ptr = + reinterpret_cast(linear2_weight->data()); + const float* ln_scale_ptr = ln_scale->data(); + + const float* ln_mean_ptr = ln_mean->data(); + const float* ln_variance_ptr = ln_variance->data(); + // outputs ptr + XPUTypeT* d_x_ptr = reinterpret_cast(d_x->data()); + XPUTypeT* d_linear1_weight_ptr = + reinterpret_cast(d_linear1_weight->data()); + XPUTypeT* d_linear1_bias_ptr = + reinterpret_cast(d_linear1_bias->data()); + XPUTypeT* d_linear2_weight_ptr = + reinterpret_cast(d_linear2_weight->data()); + XPUTypeT* d_linear2_bias_ptr = + reinterpret_cast(d_linear2_bias->data()); + float* d_ln_scale_ptr = d_ln_scale->data(); + float* d_ln_bias_ptr = d_ln_bias->data(); + + size_t l3_total_size = xpu_ctx->_l3_mgr.get_size(); + + XPUTypeT* big_tmp_l3_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* small_tmp_l3_ptr = NULL; // d_model * bsz_seq + XPUTypeT* big_tmp_gm_ptr = NULL; // dim_feedforward * bsz_seq + XPUTypeT* small_tmp_gm_ptr = NULL; // d_model * bsz_seq + + XPUTypeT* d_layernorm_out_ptr = NULL; // dx9 + XPUTypeT* d_dropout2_out_ptr = NULL; // dx7 + + XPUTypeT* d_linear2_out_ptr = NULL; // dx5 + XPUTypeT* d_dropout1_out_ptr = NULL; // dx4 + XPUTypeT* d_act_out_ptr = NULL; // dx3 + + XPUTypeT* d_linear1_out_ptr = NULL; // dx1 + + const XPUTypeT* d_residual_ptr = d_out_ptr; + + if (l3_total_size >= (dim_feedforward * bsz_seq * sizeof(T) + + d_model * bsz_seq * sizeof(T))) { + big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); + small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_l3_ptr; + d_linear2_out_ptr = big_tmp_l3_ptr; + d_dropout1_out_ptr = big_tmp_l3_ptr; + d_act_out_ptr = big_tmp_l3_ptr; + d_linear1_out_ptr = small_tmp_l3_ptr; + } else if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T)) { + big_tmp_l3_ptr = RAII_GUARD.alloc_l3(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_l3_ptr); + small_tmp_l3_ptr = big_tmp_l3_ptr; + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); + + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_gm_ptr; + d_linear2_out_ptr = big_tmp_l3_ptr; + d_dropout1_out_ptr = big_tmp_l3_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + d_linear1_out_ptr = small_tmp_l3_ptr; + + } else if (l3_total_size >= d_model * bsz_seq * sizeof(T)) { + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_l3_ptr = RAII_GUARD.alloc_l3(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_l3_ptr); + + d_layernorm_out_ptr = small_tmp_l3_ptr; + d_dropout2_out_ptr = small_tmp_l3_ptr; + d_linear2_out_ptr = big_tmp_gm_ptr; + d_dropout1_out_ptr = big_tmp_gm_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + 
d_linear1_out_ptr = small_tmp_l3_ptr; + } else { + big_tmp_gm_ptr = RAII_GUARD.alloc(dim_feedforward * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(big_tmp_gm_ptr); + small_tmp_gm_ptr = RAII_GUARD.alloc(d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_NOT_NULL(small_tmp_gm_ptr); + d_layernorm_out_ptr = small_tmp_gm_ptr; + d_dropout2_out_ptr = small_tmp_gm_ptr; + d_linear2_out_ptr = big_tmp_gm_ptr; + d_dropout1_out_ptr = big_tmp_gm_ptr; + d_act_out_ptr = big_tmp_gm_ptr; + d_linear1_out_ptr = small_tmp_gm_ptr; + } + + if (pre_layer_norm == false) { + const XPUTypeT* dropout2_out_ptr = + reinterpret_cast(dropout2_out->data()); + r = xpu::layer_norm_grad(xpu_ctx, + dropout2_out_ptr, + d_out_ptr, + d_layernorm_out_ptr, + bsz_seq, + d_model, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_variance_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + d_residual_ptr = d_layernorm_out_ptr; + } + DropoutGrad(xpu_ctx, + d_residual_ptr, + dropout2_mask_ptr, + d_dropout2_out_ptr, + dropout_param2, + bsz_seq * d_model); + // linear_grad2 + r = xpu::reduce_sum(xpu_ctx, + d_dropout2_out_ptr, + d_linear2_bias_ptr, + {bsz_seq, d_model}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + phi::XpuFcInfo linear2_fc_info; + linear2_fc_info.InitFcInfo(0, + bsz_seq, + d_model, + dim_feedforward, + false, + false, + nullptr, + nullptr, + nullptr); + + const XPUTypeT* a_1 = reinterpret_cast(NULL); + const XPUTypeT* b_1 = reinterpret_cast(NULL); + const XPUTypeT* a_2 = reinterpret_cast(NULL); + const XPUTypeT* b_2 = reinterpret_cast(NULL); + XPUTypeT* c_1 = d_linear2_out_ptr; + XPUTypeT* c_2 = d_linear2_weight_ptr; + phi::XpuFcInfo info_d_dropout1; + phi::XpuFcInfo info_dw2; + + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + linear2_fc_info, + false, + false, + dropout1_out_ptr, + linear2_weight_ptr, + d_dropout2_out_ptr); + + std::tie(info_d_dropout1, info_dw2, a_1, b_1, a_2, b_2) = fc_info; + + // if l3_total_size >= dim_feedforward * bsz_seq * sizeof(T), first transpos + if (l3_total_size >= dim_feedforward * bsz_seq * sizeof(T) && + info_dw2.trans_x) { + r = xpu::transpose(xpu_ctx, + dropout1_out_ptr, + big_tmp_l3_ptr, + {bsz_seq, dim_feedforward}, + {1, 0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + a_2 = big_tmp_l3_ptr; + info_dw2.trans_x = !info_dw2.trans_x; + info_dw2.stride_x = info_dw2.k; + } + + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true); + + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true); + + // dropout_grad1 + DropoutGrad(xpu_ctx, + d_linear2_out_ptr, + dropout1_mask_ptr, + d_dropout1_out_ptr, + dropout_param1, + bsz_seq * dim_feedforward); + + // act_grad + if (act_method == "gelu") { + r = xpu::gelu_grad(xpu_ctx, + linear1_out_ptr, + linear1_out_ptr, + d_dropout1_out_ptr, + d_act_out_ptr, + bsz_seq * dim_feedforward); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad"); + } else if (act_method == "relu") { + r = xpu::relu_grad(xpu_ctx, + linear1_out_ptr, + linear1_out_ptr, + d_dropout1_out_ptr, + d_act_out_ptr, + bsz_seq * dim_feedforward); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad"); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently only supports gelu or relu activation functions!")); + } + + // linear1_grad + r = xpu::reduce_sum(xpu_ctx, + d_act_out_ptr, + d_linear1_bias_ptr, + {bsz_seq, dim_feedforward}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + phi::XpuFcInfo linear1_fc_info; + linear1_fc_info.InitFcInfo(0, + bsz_seq, + 
dim_feedforward, + d_model, + false, + false, + nullptr, + nullptr, + nullptr); + + a_1 = reinterpret_cast(NULL); + b_1 = reinterpret_cast(NULL); + a_2 = reinterpret_cast(NULL); + b_2 = reinterpret_cast(NULL); + + c_1 = (pre_layer_norm == true ? d_linear1_out_ptr : d_x_ptr); + c_2 = d_linear1_weight_ptr; + phi::XpuFcInfo info_dx; + phi::XpuFcInfo info_dw1; + + const XPUTypeT* linear1_x_ptr = + (pre_layer_norm == true + ? reinterpret_cast(ln1_out->data()) + : x_ptr); + + if (l3_total_size >= d_model * bsz_seq * sizeof(T) && info_dw1.trans_x) { + r = xpu::transpose( + xpu_ctx, linear1_x_ptr, small_tmp_l3_ptr, {bsz_seq, d_model}, {1, 0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + a_2 = small_tmp_l3_ptr; + info_dw1.trans_x = !info_dw1.trans_x; + info_dw1.stride_x = info_dw1.k; + } + + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + linear1_fc_info, + false, + false, + linear1_x_ptr, + linear1_weight_ptr, + d_act_out_ptr); + + std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info; + + phi::MatMulXPUFunction( + xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true); + + phi::MatMulXPUFunction( + xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true); + + if (pre_layer_norm) { + r = xpu::layer_norm_grad(xpu_ctx, + x_ptr, + c_1, + c_1, + bsz_seq, + d_model, + epsilon, + ln_scale_ptr, + ln_mean_ptr, + ln_variance_ptr, + d_ln_scale_ptr, + d_ln_bias_ptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_grad"); + } + + r = xpu::add(xpu_ctx, c_1, d_residual_ptr, d_x_ptr, d_model * bsz_seq); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto place = context.GetPlace(); + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + // inputs + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + + auto* dropout1_mask = context.Input("Dropout1Mask"); + auto* dropout2_mask = context.Input("Dropout2Mask"); + auto* linear1_out = context.Input("Linear1Out"); + auto* ln1_out = pre_layer_norm ? 
context.Input("Ln1Out") : nullptr; + + auto* dropout1_out = context.Input("Dropout1Out"); + auto* dropout2_out = context.Input("Dropout2Out"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear2_weight = context.Input("Linear2Weight"); + + const Tensor* ln_mean = nullptr; + const Tensor* ln_variance = nullptr; + const Tensor* ln_scale = nullptr; + + if (pre_layer_norm) { + ln_mean = context.Input("Ln1Mean"); + ln_variance = context.Input("Ln1Variance"); + ln_scale = context.Input("Ln1Scale"); + } else { + ln_mean = context.Input("Ln2Mean"); + ln_variance = context.Input("Ln2Variance"); + ln_scale = context.Input("Ln2Scale"); + } + + // output + auto* d_x = context.Output(framework::GradVarName("X")); + + Tensor* d_ln_scale = nullptr; + Tensor* d_ln_bias = nullptr; + + if (pre_layer_norm) { + d_ln_scale = context.Output(framework::GradVarName("Ln1Scale")); + d_ln_bias = context.Output(framework::GradVarName("Ln1Bias")); + } else { + d_ln_scale = context.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = context.Output(framework::GradVarName("Ln2Bias")); + } + + auto* d_linear1_weight = + context.Output(framework::GradVarName("Linear1Weight")); + auto* d_linear1_bias = + context.Output(framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = + context.Output(framework::GradVarName("Linear2Weight")); + auto* d_linear2_bias = + context.Output(framework::GradVarName("Linear2Bias")); + + float epsilon = 0.0f; + if (pre_layer_norm) { + epsilon = context.Attr("ln1_epsilon"); + } else { + epsilon = context.Attr("ln2_epsilon"); + } + + const std::string act_method = context.Attr("act_method"); + + XPUDropoutParam dropout_param1(context, 1); + XPUDropoutParam dropout_param2(context, 2); + + const int ring_id = context.Attr("ring_id"); + + d_x->mutable_data(place); + d_ln_scale->mutable_data(place); + d_ln_bias->mutable_data(place); + d_linear1_bias->mutable_data(place); + d_linear2_bias->mutable_data(place); + d_linear1_weight->mutable_data(place); + d_linear2_weight->mutable_data(place); + + auto x_dim = x->dims(); + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( + RowMatrixFromVector(x_dim), 0, false); + + auto linear1_weight_dim = linear1_weight->dims(); + int d_model = linear1_weight_dim[0]; + int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + auto& dev_ctx = context.template device_context(); + + FFNGrad(dev_ctx, + d_out, + x, + dropout1_mask, + dropout2_mask, + linear1_out, + ln1_out, + dropout1_out, + dropout2_out, + linear1_weight, + linear2_weight, + ln_scale, + ln_mean, + ln_variance, + d_x, + d_linear1_weight, + d_linear1_bias, + d_linear2_weight, + d_linear2_bias, + d_ln_scale, + d_ln_bias, + bsz_seq, + d_model, + dim_feedforward, + dropout_param1, + dropout_param2, + act_method, + pre_layer_norm, + epsilon, + ring_id); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + fused_feedforward, + ops::FusedFeedForwardXPUKernel, + ops::FusedFeedForwardXPUKernel); + +REGISTER_OP_XPU_KERNEL( + fused_feedforward_grad, + ops::FusedFeedForwardGradXPUKernel, + ops::FusedFeedForwardGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/fused/xpu_fused_common_function.h b/paddle/fluid/operators/fused/xpu_fused_common_function.h new file mode 100644 index 0000000000000..1a1ec8c47f9ba --- /dev/null +++ b/paddle/fluid/operators/fused/xpu_fused_common_function.h @@ -0,0 +1,224 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { +using Tensor = phi::DenseTensor; + +struct XPUDropoutParam { + float dropout_prob; + bool is_upscale_in_train; + bool is_test; + bool fix_seed; + const Tensor *tensor_seed; + int seed_val; + + XPUDropoutParam() { + fix_seed = false; + is_test = false; + is_upscale_in_train = false; + dropout_prob = 0.5; + tensor_seed = nullptr; + seed_val = 0; + } + + XPUDropoutParam(const framework::ExecutionContext &context, + const int dropout_index) { + std::string pre_fix = "dropout"; + std::string str_index = std::to_string(dropout_index); + if (dropout_index > 0) { + pre_fix = pre_fix + str_index + "_"; + } else { + pre_fix = pre_fix + "_"; + } + dropout_prob = context.Attr(pre_fix + "rate"); + auto &dropout_implementation = + context.Attr(pre_fix + "implementation"); + is_upscale_in_train = (dropout_implementation == "upscale_in_train"); + is_test = context.Attr("is_test"); + fix_seed = context.Attr(pre_fix + "fix_seed"); + + std::string str_seed = "Dropout"; + if (dropout_index > 0) { + str_seed = str_seed + str_index + "Seed"; + } else { + str_seed = str_seed + "Seed"; + } + + tensor_seed = + context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + if (tensor_seed) { + seed_val = *(tensor_seed->data()); + } else { + seed_val = fix_seed ? context.Attr(pre_fix + "seed") : 0; + } + } + + void initXPUDropoutParam(float dropout_prob_, + bool is_upscale_in_train_, + bool is_test_, + bool fix_seed_, + const Tensor *tensor_seed, + int seed_val_) { + dropout_prob = dropout_prob_; + is_upscale_in_train = is_upscale_in_train_; + is_test = is_test_; + fix_seed = fix_seed_; + if (tensor_seed) { + seed_val = *(tensor_seed->data()); + } else { + seed_val = fix_seed ? seed_val_ : 0; + } + } + + void initXPUDropoutParam(const framework::ExecutionContext &context, + int dropout_index) { + std::string pre_fix = "dropout"; + std::string str_index = std::to_string(dropout_index); + if (dropout_index > 0) { + pre_fix = pre_fix + str_index + "_"; + } else { + pre_fix = pre_fix + "_"; + } + dropout_prob = context.Attr(pre_fix + "rate"); + auto &dropout_implementation = + context.Attr(pre_fix + "implementation"); + is_upscale_in_train = (dropout_implementation == "upscale_in_train"); + is_test = context.Attr("is_test"); + fix_seed = context.Attr(pre_fix + "fix_seed"); + std::string str_seed = "Dropout"; + if (dropout_index > 0) { + str_seed = str_seed + str_index + "Seed"; + } else { + str_seed = str_seed + "Seed"; + } + tensor_seed = + context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + + if (tensor_seed) { + seed_val = *(tensor_seed->data()); + } else { + seed_val = fix_seed ? 
context.Attr(pre_fix + "seed") : 0; + } + } +}; + +/****************** + * check is l3 + *******************/ + +static bool is_in_l3(const void *addr) { + int64_t addr_int = (int64_t)addr; + int addr_int_high = addr_int >> 32; + return (addr_int_high == 0); +} + +/************************* + * dropout + *************************/ + +template +void Dropout(xpu::Context *xpu_ctx, + const T *x, + T *mask, + T *y, + const XPUDropoutParam ¶m, + int len) { + using XPUType = typename XPUTypeTrait::Type; + int r = XPU_SUCCESS; + if (param.dropout_prob == 0.0f) { + r = xpu::copy(xpu_ctx, + reinterpret_cast(x), + reinterpret_cast(y), + len); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + return; + } + if (!param.is_test) { + if (param.dropout_prob == 1.0f) { + r = xpu::constant( + xpu_ctx, reinterpret_cast(y), len, XPUType(0)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + r = xpu::constant( + xpu_ctx, reinterpret_cast(mask), len, XPUType(0)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + } else { + r = xpu::dropout(xpu_ctx, + reinterpret_cast(x), + reinterpret_cast(y), + reinterpret_cast(mask), + param.seed_val, + len, + param.is_upscale_in_train, + param.dropout_prob); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout"); + } + } else { + float scale = (param.is_upscale_in_train) + ? (1.0) + : (static_cast(1.0f - param.dropout_prob)); + r = xpu::scale(xpu_ctx, + reinterpret_cast(x), + reinterpret_cast(y), + len, + false, + scale, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + } +} + +template +void DropoutGrad(xpu::Context *xpu_ctx, + const T *dy, + const T *mask, + T *dx, + const XPUDropoutParam ¶m, + int len) { + using XPUType = typename XPUTypeTrait::Type; + if (param.dropout_prob == 0.0f) { + int r = xpu::copy(xpu_ctx, + reinterpret_cast(dy), + reinterpret_cast(dx), + len); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + return; + } + if (!param.is_upscale_in_train) { + int r = xpu::mul(xpu_ctx, + reinterpret_cast(dy), + reinterpret_cast(mask), + reinterpret_cast(dx), + len); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); + } else { + int r = xpu::dropout_grad(xpu_ctx, + reinterpret_cast(mask), + reinterpret_cast(dy), + reinterpret_cast(dx), + param.dropout_prob, + len); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dropout_grad"); + } +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 8773ae273a69e..cbcbde8f9ddcd 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -704,6 +704,18 @@ XPUOpMap& get_kl2_ops() { {"fused_gemm_epilogue_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_attention", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_attention_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_feedforward", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_feedforward_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, }; return s_xpu2_kernels; diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h index 8433c6b421eed..277a4e953d6e1 100644 --- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h +++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h @@ -382,7 +382,8 @@ 
static void MatMulXPUFunction(xpu::Context* xpu_ctx, const T* y, T* out, const XpuFcInfo& fcinfo, - float alpha) { + float alpha, + bool is_grad = false) { using XPUType = typename XPUTypeTrait::Type; int fccal_type = FCCalcType(); @@ -398,6 +399,12 @@ static void MatMulXPUFunction(xpu::Context* xpu_ctx, }; auto fc_api = fc_api_list[fccal_type]; + if (std::getenv("XPU_PADDLE_FC_GRAD_LOCAL") != nullptr) { + if (is_grad) { + fc_api = fc_api_list[2]; + } + } + auto fc_batch_api = fc_batch_api_list[fccal_type]; int m = fcinfo.m; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py new file mode 100644 index 0000000000000..6462bec102ee5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py @@ -0,0 +1,331 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import sys + +sys.path.append("..") + +import paddle +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test_xpu import XPUOpTest +from paddle.fluid.framework import default_main_program + +from xpu.get_test_cover_info import ( + create_test_class, + get_xpu_op_support_types, + XPUOpTestWrapper, +) + +default_main_program().random_seed = 42 + + +class XPUTestFusedAttentionOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'fused_attention' + self.use_dynamic_create_class = False + + class TestFusedAttentionOp(XPUOpTest): + def setUp(self): + self.config() + self.generate_input_data() + self.rtol = 1e-5 + self.atol = 1e-3 + if self.x_type == np.float16 or str(self.x_type) == "float16": + self.atol = 1e-1 + + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_attention" + # use autograd to check grad in this unittest. 
+ self.__class__.no_need_check_grad = True + self.q_proj = Linear( + self.embed_dim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr, + ) + self.k_proj = Linear( + self.kdim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr, + ) + self.v_proj = Linear( + self.vdim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr, + ) + self.out_proj = Linear( + self.embed_dim, + self.embed_dim, + self.weight_attr, + bias_attr=self.bias_attr, + ) + paddle.set_default_dtype(np.float32) + self.norm1 = LayerNorm(self.embed_dim) + self.norm2 = LayerNorm(self.embed_dim) + paddle.set_default_dtype(self.x_type) + self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + + def config(self): + self.x_type = self.in_type + self.attn_mask_type = np.float32 + self.pre_layer_norm = True + self.has_attn_mask = False + self.training = True + + self.batch_size = 8 + self.query_length = 128 + self.cache_length = 128 + self.head_dim = 64 + self.num_heads = 16 + self.embed_dim = self.head_dim * self.num_heads + + self.dropout_prob = 0.0 + self.attn_dropout_prob = 0.0 + self.weight_attr = None + self.bias_attr = None + self.kdim, self.vdim = self.embed_dim, self.embed_dim + self.key_length, self.value_length = ( + self.query_length, + self.query_length, + ) + + def generate_input_data(self): + self.query = np.random.rand( + self.batch_size, self.query_length, self.embed_dim + ).astype(self.x_type) + out_seq_len = self.key_length + if self.has_attn_mask: + # [B, n_head, seq_len, out_seq_len] + self.attn_mask = np.ones( + ( + self.batch_size, + self.num_heads, + self.query_length, + out_seq_len, + ), + dtype=self.attn_mask_type, + ) + else: + self.attn_mask = None + self.key, self.value = self.query, self.query + + self.dout = np.random.random( + (self.batch_size, self.query_length, self.embed_dim) + ).astype(self.x_type) + + def GetBaselineOut(self): + paddle.disable_static() + tensor_query = paddle.to_tensor(self.query, stop_gradient=False) + + if self.has_attn_mask: + attn_mask = paddle.to_tensor( + self.attn_mask, stop_gradient=False + ) + else: + attn_mask = None + residual = tensor_query + + ln1_out = tensor_query + if self.pre_layer_norm: + ln1_out = self.norm1(tensor_query) + + q = self.q_proj(ln1_out) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + k = self.k_proj(ln1_out) + v = self.v_proj(ln1_out) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + + # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, out_seq_len] + qk_out = layers.matmul( + x=q_out * self.head_dim**-0.5, y=k_out, transpose_y=True + ) + + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype) + attn_mask_out = qk_out + attn_mask + softmax_out = F.softmax(attn_mask_out) + else: + softmax_out = F.softmax(qk_out) + + if self.dropout_prob: + dropout_out = F.dropout( + softmax_out, + self.dropout_prob, + training=self.training, + mode="upscale_in_train", + ) + # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, head_dim] + qktv_out = tensor.matmul(dropout_out, v_out) + else: + qktv_out = tensor.matmul(softmax_out, v_out) + + fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3]) + out_linear_in = 
tensor.reshape( + x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]] + ) + out = self.out_proj(out_linear_in) + + residual_out = residual + self.dropout(out) + if not self.pre_layer_norm: + final_out = self.norm1(residual_out) + else: + final_out = residual_out + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return final_out, tensor_query.grad + + def GetFusedAttentionOut(self): + paddle.disable_static() + q_proj_weight = paddle.to_tensor( + self.q_proj.weight, stop_gradient=False + ) + k_proj_weight = paddle.to_tensor( + self.k_proj.weight, stop_gradient=False + ) + v_proj_weight = paddle.to_tensor( + self.v_proj.weight, stop_gradient=False + ) + out_linear_weight = paddle.to_tensor( + self.out_proj.weight, stop_gradient=False + ) + + if self.bias_attr is False: + qkv_bias_tensor = None + out_linear_bias = None + else: + q_proj_bias = paddle.to_tensor( + self.q_proj.bias, stop_gradient=False + ) + k_proj_bias = paddle.to_tensor( + self.k_proj.bias, stop_gradient=False + ) + v_proj_bias = paddle.to_tensor( + self.v_proj.bias, stop_gradient=False + ) + qkv_bias = np.concatenate( + ( + q_proj_bias.numpy(), + k_proj_bias.numpy(), + v_proj_bias.numpy(), + ) + ) + qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim)) + qkv_bias_tensor = paddle.to_tensor( + qkv_bias, stop_gradient=False + ) + out_linear_bias = paddle.to_tensor( + self.out_proj.bias, stop_gradient=False + ) + + ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False) + ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False) + ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False) + ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False) + + q_proj_weight = q_proj_weight.numpy().transpose((1, 0)) + k_proj_weight = k_proj_weight.numpy().transpose((1, 0)) + v_proj_weight = v_proj_weight.numpy().transpose((1, 0)) + qkv_weight = np.concatenate( + (q_proj_weight, k_proj_weight, v_proj_weight) + ) + qkv_weight = qkv_weight.reshape( + (3, self.num_heads, self.head_dim, self.embed_dim) + ) + + x = paddle.to_tensor(self.query, stop_gradient=False) + cache_kv = None + if self.has_attn_mask: + attn_mask = paddle.to_tensor( + self.attn_mask, stop_gradient=False + ) + else: + attn_mask = None + qkv_weight_tensor = paddle.to_tensor( + qkv_weight, stop_gradient=False + ) + epsilon = 1e-05 + ln2_epsilon = 1e-05 + + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, x.dtype) + final_out = incubate_f.fused_multi_head_attention( + x, + qkv_weight_tensor, + out_linear_weight, + self.pre_layer_norm, + ln1_scale, + ln1_bias, + ln2_scale, + ln2_bias, + epsilon, + qkv_bias_tensor, + out_linear_bias, + cache_kv, + attn_mask, + self.dropout_prob, + self.attn_dropout_prob, + ln2_epsilon, + ) + + paddle.autograd.backward( + [final_out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return final_out, x.grad + + def test_fused_attention_op(self): + final_out_ref, x_grad_ref = self.GetBaselineOut() + final_out, x_grad = self.GetFusedAttentionOut() + np.testing.assert_allclose( + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol + ) + np.testing.assert_allclose( + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol + ) + + class TestFusedAttentionOpPreLn(TestFusedAttentionOp): + def config(self): + super().config() + self.pre_layer_norm = True + + class TestFusedAttentionOpNoneAttnMask(TestFusedAttentionOp): + def config(self): + super().config() + self.pre_layer_norm = True + 
self.has_attn_mask = False + + +support_types = get_xpu_op_support_types('fused_attention') +for stype in support_types: + create_test_class(globals(), XPUTestFusedAttentionOp, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py new file mode 100644 index 0000000000000..f8a6fb75eba0e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py @@ -0,0 +1,379 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from paddle.nn.layer import transformer +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +import unittest +from op_test_xpu import XPUOpTest +from paddle.fluid.framework import default_main_program + +from xpu.get_test_cover_info import ( + create_test_class, + XPUOpTestWrapper, +) + + +class XPUTestFusedFFNOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'fused_feedforward' + self.use_dynamic_create_class = False + + class TestFusedFFNOp(XPUOpTest): + def getDtype(self): + self.dtype = self.in_type + self.layer_norm_dtype = "float32" + + def getShape(self): + self.batch_size = np.random.randint(1, 32) + self.query_length = np.random.randint(32, 128) + self.d_model = np.random.randint(32, 512) + self.dim_feedforward = np.random.randint(32, 512) + + def getDiff(self): + self.rtol = 1e-2 + self.atol = 1e-3 + if self.dtype == np.float16 or self.dtype == "float16": + self.atol = 1e-1 + + def getActivation(self): + self.act_method = "gelu" + + def getNormalizeBefore(self): + self.pre_layer_norm = False + + def setUp(self): + paddle.disable_static() + self.__class__.op_type = "fused_feedforward" + # check grad in test_out_and_grad() + self.__class__.no_need_check_grad = True + self.getDtype() + self.getShape() + self.getDiff() + self.getActivation() + self.getNormalizeBefore() + paddle.set_default_dtype(self.dtype) + self.weight_attr = None + self.bias_attr = None + + self.weight_attrs = transformer._convert_param_attr_to_list( + self.weight_attr, 2 + ) + self.bias_attrs = transformer._convert_param_attr_to_list( + self.bias_attr, 2 + ) + self.linear1 = Linear( + self.d_model, + self.dim_feedforward, + self.weight_attrs[1], + bias_attr=self.bias_attrs[1], + ) + self.linear2 = Linear( + self.dim_feedforward, + self.d_model, + self.weight_attrs[1], + bias_attr=self.bias_attrs[1], + ) + + paddle.set_default_dtype(self.layer_norm_dtype) + self.norm1 = LayerNorm(self.d_model) + self.norm2 = LayerNorm(self.d_model) + paddle.set_default_dtype(self.dtype) + self.dropout1 = Dropout(0.0, mode="upscale_in_train") + self.dropout2 = Dropout(0.0, mode="upscale_in_train") + self.activation = getattr(F, self.act_method) + + self.src = 
np.random.random( + (self.batch_size, self.query_length, self.d_model) + ).astype(self.dtype) + self.dout = np.random.random( + (self.batch_size, self.query_length, self.d_model) + ).astype(self.dtype) + + def Base(self): + paddle.disable_static() + tensor_src = paddle.to_tensor(self.src, stop_gradient=False) + residual = tensor_src + if self.pre_layer_norm: + ln1_out = self.norm1(tensor_src) + linear2_out = self.linear2( + self.dropout1(self.activation(self.linear1(ln1_out))) + ) + dropout2_out = residual + self.dropout2(linear2_out) + paddle.autograd.backward( + [dropout2_out], [paddle.to_tensor(self.dout)], True + ) + return dropout2_out, tensor_src.grad + else: + linear2_out = self.linear2( + self.dropout1(self.activation(self.linear1(tensor_src))) + ) + dropout2_out = residual + self.dropout2(linear2_out) + dropout2_out = self.norm2(dropout2_out) + paddle.autograd.backward( + [dropout2_out], [paddle.to_tensor(self.dout)], True + ) + return dropout2_out, tensor_src.grad + + def FusedFFN(self): + paddle.disable_static() + linear1_weight = paddle.to_tensor( + self.linear1.weight, stop_gradient=False + ) + linear1_bias = paddle.to_tensor( + self.linear1.bias, stop_gradient=False + ) + linear2_weight = paddle.to_tensor( + self.linear2.weight, stop_gradient=False + ) + linear2_bias = paddle.to_tensor( + self.linear2.bias, stop_gradient=False + ) + ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False) + ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False) + ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False) + ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False) + x = paddle.to_tensor(self.src, stop_gradient=False) + out = incubate_f.fused_feedforward( + x, + linear1_weight, + linear2_weight, + linear1_bias, + linear2_bias, + ln1_scale, + ln1_bias, + ln2_scale, + ln2_bias, + 0.0, + 0.0, + activation=self.act_method, + pre_layer_norm=self.pre_layer_norm, + ) + paddle.autograd.backward([out], [paddle.to_tensor(self.dout)]) + return out, x.grad + + def test_out_and_grad(self): + default_main_program().random_seed = 42 + base_out, base_grad = self.Base() + fused_out, fused_grad = self.FusedFFN() + np.testing.assert_allclose( + base_out.numpy(), + fused_out.numpy(), + rtol=self.rtol, + atol=self.atol, + ) + np.testing.assert_allclose( + base_grad.numpy(), + fused_grad.numpy(), + rtol=self.rtol, + atol=self.atol, + ) + + class TestFusedFFNOpActivation(TestFusedFFNOp): + def getActivation(self): + self.act_method = "relu" + + class TestFusedFFNOpNormalizeBefore(TestFusedFFNOp): + def getNormalizeBefore(self): + self.pre_layer_norm = True + + def getShape(self): + self.batch_size = 1 + self.query_length = 1 + self.d_model = 8 + self.dim_feedforward = 8 + + +class APITestStaticFusedFFN(unittest.TestCase): + def test_static(self): + paddle.enable_static() + default_main_program().random_seed = 42 + dtype = "float32" + layer_norm_dtype = "float32" + batch_size = 1 + d_model = 8 + dim_feedforward = 8 + + x = paddle.static.data( + name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype + ) + linear1_weight = paddle.static.data( + name='linear1_weight', shape=[d_model, dim_feedforward], dtype=dtype + ) + linear1_bias = paddle.static.data( + name='linear1_bias', shape=[dim_feedforward], dtype=dtype + ) + linear2_weight = paddle.static.data( + name='linear2_weight', shape=[dim_feedforward, d_model], dtype=dtype + ) + linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model]) + ln1_scale = paddle.static.data(name='ln1_scale', 
shape=[d_model]) + ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model]) + ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model]) + ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model]) + + fused_out = incubate_f.fused_feedforward( + x, + linear1_weight, + linear2_weight, + linear1_bias, + linear2_bias, + ln1_scale, + ln1_bias, + ln2_scale, + ln2_bias, + 0.0, + 0.0, + activation="relu", + pre_layer_norm=False, + ) + + linear1_out = F.linear(x, linear1_weight, linear1_bias) + act_out = F.relu(linear1_out) + dropout1_out = F.dropout(x=act_out, p=0.0, training=False) + linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias) + dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False) + ln_out = F.layer_norm( + dropout2_out, + normalized_shape=list([d_model]), + weight=ln2_scale, + bias=ln2_bias, + ) + + exe = paddle.static.Executor(paddle.XPUPlace(0)) + + x_data = np.random.random( + (batch_size, d_model, dim_feedforward) + ).astype(dtype) + linear1_weight_data = np.random.random( + (d_model, dim_feedforward) + ).astype(dtype) + linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype) + linear2_weight_data = np.random.random( + (dim_feedforward, d_model) + ).astype(dtype) + linear2_bias_data = np.zeros((d_model)).astype(dtype) + + ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype) + ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype) + ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype) + ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype) + + res_list = [fused_out, ln_out] + real_res = [] + + for res in res_list: + fetch = exe.run( + feed={ + 'x': x_data, + 'linear1_weight': linear1_weight_data, + 'linear1_bias': linear1_bias_data, + 'linear2_weight': linear2_weight_data, + 'linear2_bias': linear2_bias_data, + 'ln1_scale': ln1_scale_data, + 'ln1_bias': ln1_bias_data, + 'ln2_scale': ln2_scale_data, + 'ln2_bias': ln2_bias_data, + }, + fetch_list=[res], + ) + real_res.append(fetch) + np.testing.assert_allclose( + real_res[0], real_res[1], rtol=1e-05, atol=0.001 + ) + + +class TestFusedFFNOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + + def test_dtype(): + x = paddle.static.data( + name='x', shape=[1, 10, 10], dtype="int32" + ) + linear1_weight = paddle.static.data( + name='linear1_weight', shape=[1, 10, 10], dtype="float32" + ) + linear2_weight = paddle.static.data( + name='linear2_weight', shape=[1, 10, 10], dtype="float32" + ) + incubate_f.fused_feedforward(x, linear1_weight, linear2_weight) + + self.assertRaises(TypeError, test_dtype) + + def test_dropout_rate_type(): + x = paddle.static.data( + name='x1', shape=[1, 10, 10], dtype="float32" + ) + linear1_weight = paddle.static.data( + name='linear1_weight1', shape=[10, 10], dtype="float32" + ) + linear2_weight = paddle.static.data( + name='linear2_weight1', shape=[10, 10], dtype="float32" + ) + incubate_f.fused_feedforward( + x, linear1_weight, linear2_weight, dropout1_rate="a" + ) + + self.assertRaises(TypeError, test_dropout_rate_type) + + def test_dropout_rate_value(): + x = paddle.static.data( + name='x2', shape=[1, 10, 10], dtype="float32" + ) + linear1_weight = paddle.static.data( + name='linear1_weight2', shape=[10, 10], dtype="float32" + ) + linear2_weight = paddle.static.data( + name='linear2_weight2', shape=[10, 10], dtype="float32" + ) + incubate_f.fused_feedforward( + x, linear1_weight, linear2_weight,
dropout2_rate=-1 + ) + + self.assertRaises(ValueError, test_dropout_rate_value) + + def test_dropout_mode(): + x = paddle.static.data( + name='x3', shape=[1, 10, 10], dtype="float32" + ) + linear1_weight = paddle.static.data( + name='linear1_weight3', shape=[10, 10], dtype="float32" + ) + linear2_weight = paddle.static.data( + name='linear2_weight3', shape=[10, 10], dtype="float32" + ) + incubate_f.fused_feedforward( + x, linear1_weight, linear2_weight, mode='test' + ) + + self.assertRaises(ValueError, test_dropout_mode) + + +support_types = {"float32"} # get_xpu_op_support_types('fused_feedforward') +for stype in support_types: + create_test_class(globals(), XPUTestFusedFFNOp, stype) + +if __name__ == "__main__": + unittest.main() From c374894d9e33bcc86af2453a4b6c1e20765a2ec6 Mon Sep 17 00:00:00 2001 From: Mountagha Date: Thu, 17 Nov 2022 01:37:48 -0500 Subject: [PATCH 061/210] optimizing a bit tensor_array initialization (#48066) --- paddle/phi/core/tensor_array.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc index 0f8e4ba4c926d..2007e71d5a0fd 100644 --- a/paddle/phi/core/tensor_array.cc +++ b/paddle/phi/core/tensor_array.cc @@ -23,13 +23,12 @@ TensorArray::TensorArray(const std::vector& vec) { /// \brief Test whether the tensor's storage in TensorArray is allocated. /// return Whether all tensors in TensorArray is allocated. bool TensorArray::initialized() const { - bool init = true; for (auto tensor : tensors_) { if (!tensor.IsInitialized()) { - init = false; + return false; } } - return init; + return true; } int64_t TensorArray::numel() const { From f3650201e2c0316261f8474c2501e33e594cecff Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 17 Nov 2022 14:42:37 +0800 Subject: [PATCH 062/210] [PHI]Standardise some C++ API (Part5) (#47860) * standard api * fix xpu bugs --- .../operators/fused/fused_dropout_test.h | 1 - paddle/fluid/operators/tril_triu_op.cc | 2 +- paddle/phi/api/yaml/legacy_backward.yaml | 22 +++++++++---- paddle/phi/api/yaml/legacy_ops.yaml | 19 ++++++++--- paddle/phi/infermeta/multiary.cc | 8 ++--- paddle/phi/infermeta/multiary.h | 8 ++--- paddle/phi/infermeta/ternary.cc | 1 - paddle/phi/infermeta/ternary.h | 1 - paddle/phi/infermeta/unary.cc | 18 ++++++++--- paddle/phi/infermeta/unary.h | 14 +++++--- paddle/phi/kernels/cpu/hsigmoid_loss_grad.h | 4 --- .../kernels/cpu/hsigmoid_loss_grad_kernel.cc | 8 ----- .../phi/kernels/cpu/hsigmoid_loss_kernel.cc | 8 ++--- .../phi/kernels/cpu/layer_norm_grad_kernel.cc | 1 - paddle/phi/kernels/cpu/layer_norm_kernel.cc | 1 - paddle/phi/kernels/cpu/matrix_rank_kernel.cc | 2 +- ...rad_kernel.cc => tril_triu_grad_kernel.cc} | 24 +++++++++++++- .../{tril_kernel.cc => tril_triu_kernel.cc} | 24 +++++++++++++- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 1 - paddle/phi/kernels/gpu/layer_norm_kernel.cu | 1 - paddle/phi/kernels/gpu/lstsq_kernel.cu | 6 ++-- paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 2 +- paddle/phi/kernels/gpu/qr_kernel.cu | 6 ++-- ...rad_kernel.cu => tril_triu_grad_kernel.cu} | 24 +++++++++++++- .../{tril_kernel.cu => tril_triu_kernel.cu} | 24 +++++++++++++- .../phi/kernels/hsigmoid_loss_grad_kernel.h | 4 --- paddle/phi/kernels/hsigmoid_loss_kernel.h | 8 ++--- paddle/phi/kernels/impl/qr_grad_kernel_impl.h | 6 ++-- ...el_impl.h => tril_triu_grad_kernel_impl.h} | 28 ++++++++++++---- ..._kernel_impl.h => tril_triu_kernel_impl.h} | 28 ++++++++++++---- paddle/phi/kernels/layer_norm_grad_kernel.h | 1 - 
paddle/phi/kernels/layer_norm_kernel.h | 1 - paddle/phi/kernels/matrix_rank_kernel.h | 2 +- .../hsigmoid_loss_grad_kernel.cc | 8 ----- .../selected_rows/hsigmoid_loss_grad_kernel.h | 4 --- ..._grad_kernel.h => tril_triu_grad_kernel.h} | 14 +++++++- .../{tril_kernel.h => tril_triu_kernel.h} | 26 +++++++++++---- .../phi/kernels/xpu/layer_norm_grad_kernel.cc | 1 - paddle/phi/kernels/xpu/layer_norm_kernel.cc | 1 - ...rad_kernel.cc => tril_triu_grad_kernel.cc} | 32 +++++++++++++++---- .../{tril_kernel.cc => tril_triu_kernel.cc} | 31 ++++++++++++++---- .../ops/compat/hierarchical_sigmoid_sig.cc | 26 +++------------ paddle/phi/ops/compat/layer_norm_sig.cc | 4 +-- paddle/phi/ops/compat/matrix_rank_sig.cc | 2 +- paddle/phi/ops/compat/tril_triu_sig.cc | 7 ++-- python/paddle/fluid/dygraph/nn.py | 1 - .../fluid/tests/unittests/test_hsigmoid_op.py | 7 ++-- .../tests/unittests/test_matrix_rank_op.py | 2 +- python/paddle/nn/functional/loss.py | 8 ++--- python/paddle/nn/functional/norm.py | 2 +- python/paddle/tensor/creation.py | 2 +- python/paddle/tensor/linalg.py | 2 +- 52 files changed, 309 insertions(+), 179 deletions(-) rename paddle/phi/kernels/cpu/{tril_grad_kernel.cc => tril_triu_grad_kernel.cc} (60%) rename paddle/phi/kernels/cpu/{tril_kernel.cc => tril_triu_kernel.cc} (60%) rename paddle/phi/kernels/gpu/{tril_grad_kernel.cu => tril_triu_grad_kernel.cu} (60%) rename paddle/phi/kernels/gpu/{tril_kernel.cu => tril_triu_kernel.cu} (60%) rename paddle/phi/kernels/impl/{tril_grad_kernel_impl.h => tril_triu_grad_kernel_impl.h} (68%) rename paddle/phi/kernels/impl/{tril_kernel_impl.h => tril_triu_kernel_impl.h} (70%) rename paddle/phi/kernels/{tril_grad_kernel.h => tril_triu_grad_kernel.h} (66%) rename paddle/phi/kernels/{tril_kernel.h => tril_triu_kernel.h} (60%) rename paddle/phi/kernels/xpu/{tril_grad_kernel.cc => tril_triu_grad_kernel.cc} (67%) rename paddle/phi/kernels/xpu/{tril_kernel.cc => tril_triu_kernel.cc} (69%) diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index b71ed39c68251..a985d23b483a7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -193,7 +193,6 @@ void LayerNorm(const std::vector> &scale, bias_opt, 1e-5, 1, - false, tensor_y, tensor_mean, tensor_variance); diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 97c9289295022..5d2c3c0797acf 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -93,7 +93,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; DECLARE_INFER_SHAPE_FUNCTOR(tril_triu, TrilTriuInferShapeFunctor, - PD_INFER_META(phi::TrilInferMeta)); + PD_INFER_META(phi::TrilTriuInferMeta)); REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index f920bbb8b23a7..a61aa52cc821f 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -655,8 +655,8 @@ inplace : (out_grad -> x_grad) - backward_op : hsigmoid_loss_grad - forward : hsigmoid_loss (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) - args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, 
Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) + forward : hsigmoid_loss (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) + args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, bool is_sparse) output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta @@ -764,8 +764,8 @@ func : label_smooth_grad - backward_op : layer_norm_grad - forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) -> Tensor(out), Tensor(mean), Tensor(variance) - args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis, bool is_test) + forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis) -> Tensor(out), Tensor(mean), Tensor(variance) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) infer_meta : func : LayerNormGradInferMeta @@ -1830,8 +1830,8 @@ func : triangular_solve_grad - backward_op : tril_grad - forward : tril(Tensor x, int diagonal, bool lower) -> Tensor(out) - args : (Tensor out_grad, int diagonal, bool lower) + forward : tril(Tensor x, int diagonal) -> Tensor(out) + args : (Tensor out_grad, int diagonal) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1851,6 +1851,16 @@ func : trilinear_interp_grad data_type : output_grad +- backward_op : triu_grad + forward : triu(Tensor x, int diagonal) -> Tensor(out) + args : (Tensor out_grad, int diagonal) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] + kernel : + func : triu_grad + - backward_op : unbind_grad forward : unbind (Tensor input, int axis) -> Tensor[](out) args : (Tensor[] out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index a1bc49a477ac3..7fb2c2441055e 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -934,7 +934,7 @@ backward : hardtanh_grad - op : hsigmoid_loss - args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) + args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) output : Tensor(out), Tensor(pre_out), Tensor(w_out) infer_meta : func : HSigmoidLossInferMeta @@ -1091,7 +1091,7 @@ inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) - op : layer_norm - args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) + args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis) output : Tensor(out), Tensor(mean), Tensor(variance) infer_meta : func : LayerNormInferMeta @@ -1294,11 +1294,11 @@ backward : matrix_power_grad - op : matrix_rank - args : (Tensor x, float tol, bool use_default_tol=true, 
bool hermitian=false) + args : (Tensor x, float tol, bool hermitian=false, bool use_default_tol=true) output : Tensor(out) infer_meta : func : MatrixRankInferMeta - param : [x, use_default_tol, hermitian] + param : [x, hermitian, use_default_tol] kernel : func : matrix_rank @@ -2226,7 +2226,7 @@ backward : triangular_solve_grad - op : tril - args : (Tensor x, int diagonal, bool lower) + args : (Tensor x, int diagonal) output : Tensor(out) infer_meta : func : TrilInferMeta @@ -2257,6 +2257,15 @@ data_type : x backward : trilinear_interp_grad +- op : triu + args : (Tensor x, int diagonal) + output : Tensor(out) + infer_meta : + func : TriuInferMeta + kernel : + func : triu + backward : triu_grad + - op : triu_indices args : (int row, int col, int offset, DataType dtype, Place place={}) output : Tensor(out) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index b9e84c2df5714..29c2436c8e8c0 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1330,17 +1330,13 @@ void GraphSampleNeighborsInferMeta(const MetaTensor& row, } void HSigmoidLossInferMeta(const MetaTensor& x, - const MetaTensor& w, const MetaTensor& label, + const MetaTensor& w, + const MetaTensor& bias, const MetaTensor& path, const MetaTensor& code, - const MetaTensor& bias, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a37925202926a..3607097400a39 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -288,17 +288,13 @@ void GraphSampleNeighborsInferMeta(const MetaTensor& row, MetaTensor* out_eids); void HSigmoidLossInferMeta(const MetaTensor& x, - const MetaTensor& w, const MetaTensor& label, + const MetaTensor& w, + const MetaTensor& bias, const MetaTensor& path, const MetaTensor& code, - const MetaTensor& bias, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 9b3ffbd083762..1b945c0254fb3 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -498,7 +498,6 @@ void LayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, float epsilon, int begin_norm_axis, - bool is_test, MetaTensor* out, MetaTensor* mean, MetaTensor* variance, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 1d0e7e8744dc1..7f24f2970095b 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -87,7 +87,6 @@ void LayerNormInferMeta(const MetaTensor& x, const MetaTensor& bias, float epsilon, int begin_norm_axis, - bool is_test, MetaTensor* out, MetaTensor* mean, MetaTensor* variance, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f51a4a2b2b9de..a3da5aca24e11 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1882,8 +1882,8 @@ void LUInferMeta(const MetaTensor& x, } void MatrixRankInferMeta(const MetaTensor& x, - bool use_default_tol, bool hermitian, + bool use_default_tol, MetaTensor* out) { auto dim_x = x.dims(); PADDLE_ENFORCE_GE(dim_x.size(), @@ -4156,10 +4156,10 @@ void UnbindInferMeta(const MetaTensor& x, } } 
-void TrilInferMeta(const MetaTensor& x, - int diagonal, - bool lower, - MetaTensor* out) { +void TrilTriuInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out) { const auto& x_dims = x.dims(); PADDLE_ENFORCE_GE(x_dims.size(), 2, @@ -4170,6 +4170,14 @@ void TrilInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void TrilInferMeta(const MetaTensor& x, int diagonal, MetaTensor* out) { + TrilTriuInferMeta(x, diagonal, true, out); +} + +void TriuInferMeta(const MetaTensor& x, int diagonal, MetaTensor* out) { + TrilTriuInferMeta(x, diagonal, false, out); +} + void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->share_meta(x); } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 153b2b8f5f217..c7b7780b0cbee 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -264,8 +264,8 @@ void LUInferMeta(const MetaTensor& x, void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); void MatrixRankInferMeta(const MetaTensor& x, - bool use_default_tol, bool hermitian, + bool use_default_tol, MetaTensor* out); void MaxOutInferMeta(const MetaTensor& x, @@ -601,10 +601,14 @@ void TransposeGradInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); -void TrilInferMeta(const MetaTensor& x, - int diagonal, - bool lower, - MetaTensor* out); +void TrilInferMeta(const MetaTensor& x, int diagonal, MetaTensor* out); + +void TriuInferMeta(const MetaTensor& x, int diagonal, MetaTensor* out); + +void TrilTriuInferMeta(const MetaTensor& x, + int diagonal, + bool lower, + MetaTensor* out); void UnbindInferMeta(const MetaTensor& x, int axis, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h index 7e6693c4dd7a1..12960e305a0c6 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h @@ -37,10 +37,6 @@ void HSigmoidLossGradKernelImpl(const Context& ctx, const DenseTensor& out_grad, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc index efb59d1f48267..bc741b32b3afc 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc @@ -32,10 +32,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& out_grad, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, @@ -51,10 +47,6 @@ void HSigmoidLossGradKernel(const Context& ctx, out_grad, num_classes, remote_prefetch, - trainer_id, - height_sections, - epmap, - table_names, is_sparse, x_grad, w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc index fa0c83031d904..a6f10b4ff13b4 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc @@ -30,17 +30,13 @@ namespace math = paddle::operators::math; template void HSigmoidLossKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& w, const DenseTensor& label, + const DenseTensor& w, + const paddle::optional& bias, const paddle::optional& path, const 
paddle::optional& code, - const paddle::optional& bias, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index 58d69cb3454e7..fb61ced9f4326 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -40,7 +40,6 @@ void LayerNormGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 7061d4f0ad730..542f19c83b274 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -35,7 +35,6 @@ void LayerNormKernel(const Context& dev_ctx, const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* y, DenseTensor* mean, DenseTensor* var) { diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc index f56bd3d6dbe8a..9c23ead1e68da 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -24,8 +24,8 @@ template void MatrixRankKernel(const Context& dev_ctx, const DenseTensor& x, float tol, - bool use_default_tol, bool hermitian, + bool use_default_tol, DenseTensor* out) { DenseTensor atol_tensor; if (use_default_tol) { diff --git a/paddle/phi/kernels/cpu/tril_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc similarity index 60% rename from paddle/phi/kernels/cpu/tril_grad_kernel.cc rename to paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc index fba457424fc05..83600f7fd6df6 100644 --- a/paddle/phi/kernels/cpu/tril_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -14,7 +14,29 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(tril_triu_grad, + CPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(triu_grad, + CPU, + ALL_LAYOUT, + phi::TriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} PD_REGISTER_KERNEL(tril_grad, CPU, diff --git a/paddle/phi/kernels/cpu/tril_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc similarity index 60% rename from paddle/phi/kernels/cpu/tril_kernel.cc rename to paddle/phi/kernels/cpu/tril_triu_kernel.cc index 82902a1977297..1a5773d3dd2f9 100644 --- a/paddle/phi/kernels/cpu/tril_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -14,7 +14,29 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +PD_REGISTER_KERNEL(tril_triu, + CPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(triu, + CPU, + ALL_LAYOUT, + phi::TriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} PD_REGISTER_KERNEL(tril, CPU, 
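
For readers following the tril/triu kernel split above: the shared TrilTriuKernel keeps or zeroes each element based on its offset from the main diagonal, and the new separate tril/triu entry points simply forward to it with lower set to true or false. The snippet below is a minimal NumPy sketch of that masking rule, not the Paddle kernel itself; the helper name is invented for the illustration.

    import numpy as np

    def tril_triu_reference(x, diagonal=0, lower=True):
        # Keep x[i, j] when j - i <= diagonal (tril) or j - i >= diagonal (triu).
        rows, cols = x.shape
        offset = np.arange(cols) - np.arange(rows)[:, None]  # offset[i, j] = j - i
        mask = offset <= diagonal if lower else offset >= diagonal
        return np.where(mask, x, np.zeros_like(x))

    x = np.arange(16, dtype=np.float32).reshape(4, 4)
    assert np.array_equal(tril_triu_reference(x, 0, lower=True), np.tril(x))
    assert np.array_equal(tril_triu_reference(x, -1, lower=False), np.triu(x, -1))
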
diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 961937441e1cf..0ec43eab3785c 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -31,7 +31,6 @@ void LayerNormGradKernel(const Context &dev_ctx, const DenseTensor &out_grad, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor *x_grad, DenseTensor *scale_grad, DenseTensor *bias_grad) { diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index f2ad08c680058..1dd1070884732 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -61,7 +61,6 @@ void LayerNormKernel(const Context &dev_ctx, const paddle::optional &bias_opt, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor *y, DenseTensor *mean, DenseTensor *var) { diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 0e59dbe9df7fa..adb0ca09d8938 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu +++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -23,7 +23,7 @@ #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/impl/lstsq_kernel_impl.h" #include "paddle/phi/kernels/impl/qr_kernel_impl.h" -#include "paddle/phi/kernels/impl/tril_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -110,7 +110,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* res_r = new DenseTensor(); res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); - phi::TrilKernel(dev_ctx, slice_r, 0, false, res_r); + phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); DenseTensor trans_y = phi::TransposeLast2Dim(dev_ctx, tmp_y); DenseTensor slice_y = @@ -135,7 +135,7 @@ void LstsqKernel(const Context& dev_ctx, DenseTensor* res_r = new DenseTensor(); res_r->Resize(phi::make_ddim({batch_count, min_mn, min_mn})); dev_ctx.template Alloc(res_r); - phi::TrilKernel(dev_ctx, slice_r, 0, false, res_r); + phi::TrilTriuKernel(dev_ctx, slice_r, 0, false, res_r); phi::TriangularSolveKernel( dev_ctx, *res_r, *new_y, true, true, false, solution); diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu index 9727e2cc114c5..73e703fcabb20 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu @@ -27,8 +27,8 @@ template void MatrixRankKernel(const Context& dev_ctx, const DenseTensor& x, float tol, - bool use_default_tol, bool hermitian, + bool use_default_tol, DenseTensor* out) { DenseTensor atol_tensor; if (use_default_tol) { diff --git a/paddle/phi/kernels/gpu/qr_kernel.cu b/paddle/phi/kernels/gpu/qr_kernel.cu index 697cf952c1cec..99752ac486d6e 100644 --- a/paddle/phi/kernels/gpu/qr_kernel.cu +++ b/paddle/phi/kernels/gpu/qr_kernel.cu @@ -31,7 +31,7 @@ #include "paddle/phi/kernels/qr_kernel.h" #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/phi/kernels/tril_kernel.h" +#include "paddle/phi/kernels/tril_triu_kernel.h" namespace phi { @@ -103,12 +103,12 @@ void QrKernel(const Context& ctx, auto trans_qr = TransposeLast2Dim(ctx, qr); auto sliced_qr = SliceKernel( ctx, trans_qr, {trans_qr.dims().size() - 2}, {0}, {min_mn}, {1}, {}); - auto tmp_r = Tril(ctx, 
sliced_qr, 0, false); + auto tmp_r = TrilTriu(ctx, sliced_qr, 0, false); // Transpose 'tmp_r' to retore the original row-major order phi::Copy(ctx, tmp_r, r->place(), false, r); } else { auto trans_qr = TransposeLast2Dim(ctx, qr); - auto tmp_r = Tril(ctx, trans_qr, 0, false); + auto tmp_r = TrilTriu(ctx, trans_qr, 0, false); // Transpose 'tmp_r' to retore the original row-major order phi::Copy(ctx, tmp_r, r->place(), false, r); } diff --git a/paddle/phi/kernels/gpu/tril_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu similarity index 60% rename from paddle/phi/kernels/gpu/tril_grad_kernel.cu rename to paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu index 5bda0e54b33a6..5bd5b730acfeb 100644 --- a/paddle/phi/kernels/gpu/tril_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" PD_REGISTER_KERNEL(tril_grad, GPU, @@ -26,3 +26,25 @@ PD_REGISTER_KERNEL(tril_grad, int, int64_t, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(triu_grad, + GPU, + ALL_LAYOUT, + phi::TriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(tril_triu_grad, + GPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/tril_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu similarity index 60% rename from paddle/phi/kernels/gpu/tril_kernel.cu rename to paddle/phi/kernels/gpu/tril_triu_kernel.cu index c50b7c513fd07..6a7aa54f8e379 100644 --- a/paddle/phi/kernels/gpu/tril_kernel.cu +++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu @@ -14,7 +14,29 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tril_kernel_impl.h" +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +PD_REGISTER_KERNEL(tril_triu, + GPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(triu, + GPU, + ALL_LAYOUT, + phi::TriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} PD_REGISTER_KERNEL(tril, GPU, diff --git a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h index e31d429107990..c36b343017fd5 100644 --- a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h @@ -30,10 +30,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& out_grad, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/hsigmoid_loss_kernel.h b/paddle/phi/kernels/hsigmoid_loss_kernel.h index c8fb3ca77f3f9..33a90c637e4e4 100644 --- a/paddle/phi/kernels/hsigmoid_loss_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_kernel.h @@ -21,17 +21,13 @@ namespace phi { template void HSigmoidLossKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& w, const DenseTensor& label, + const DenseTensor& w, + const paddle::optional& bias, const paddle::optional& path, const paddle::optional& code, - const paddle::optional& bias, int num_classes, bool remote_prefetch, - int trainer_id, 
- const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h index 5ad59f757aa22..5c04d9bb90cfe 100644 --- a/paddle/phi/kernels/impl/qr_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/qr_grad_kernel_impl.h @@ -29,7 +29,7 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -#include "paddle/phi/kernels/tril_kernel.h" +#include "paddle/phi/kernels/tril_triu_kernel.h" namespace phi { @@ -116,8 +116,8 @@ void QrGradKernel(const Context& ctx, DenseTensor M_tmp1 = Subtract(ctx, R_term, Q_term); // Compute M = (tril(M) + tril(M).mH()) * 0.5 Identity - DenseTensor M_tril_0 = Tril(ctx, M_tmp1, 0, true); - DenseTensor M_tril_1 = Tril(ctx, M_tmp1, -1, true); + DenseTensor M_tril_0 = TrilTriu(ctx, M_tmp1, 0, true); + DenseTensor M_tril_1 = TrilTriu(ctx, M_tmp1, -1, true); DenseTensor M = Add( ctx, M_tril_0, TransposeLast2Dim(ctx, M_tril_1)); diff --git a/paddle/phi/kernels/impl/tril_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h similarity index 68% rename from paddle/phi/kernels/impl/tril_grad_kernel_impl.h rename to paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h index 3f72d34a957bd..173f09e35e246 100644 --- a/paddle/phi/kernels/impl/tril_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h @@ -16,16 +16,16 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/tril_grad_kernel.h" +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" namespace phi { template -void TrilGradKernel(const Context& ctx, - const DenseTensor& out_grad, - int diagonal, - bool lower, - DenseTensor* x_grad) { +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { const auto* dout_data = out_grad.data(); auto* dx_data = ctx.template Alloc(x_grad); @@ -40,4 +40,20 @@ void TrilGradKernel(const Context& ctx, for_range(tril_triu_grad_computer); } +template +void TrilGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + DenseTensor* x_grad) { + TrilTriuGradKernel(ctx, out_grad, diagonal, true, x_grad); +} + +template +void TriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + DenseTensor* x_grad) { + TrilTriuGradKernel(ctx, out_grad, diagonal, false, x_grad); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/tril_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h similarity index 70% rename from paddle/phi/kernels/impl/tril_kernel_impl.h rename to paddle/phi/kernels/impl/tril_triu_kernel_impl.h index 8e93e87fbc4d8..5820bbb3aaf08 100644 --- a/paddle/phi/kernels/impl/tril_kernel_impl.h +++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h @@ -16,16 +16,16 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/tril_kernel.h" +#include "paddle/phi/kernels/tril_triu_kernel.h" namespace phi { template -void TrilKernel(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower, - DenseTensor* out) { +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { const auto* x_data = x.data(); auto* 
out_data = ctx.template Alloc(out); @@ -39,4 +39,20 @@ void TrilKernel(const Context& ctx, for_range(tril_triu_computer); } +template +void TrilKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + DenseTensor* out) { + TrilTriuKernel(ctx, x, diagonal, true, out); +} + +template +void TriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + DenseTensor* out) { + TrilTriuKernel(ctx, x, diagonal, false, out); +} + } // namespace phi diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h index 7d7cd13109be1..81975760994de 100644 --- a/paddle/phi/kernels/layer_norm_grad_kernel.h +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -28,7 +28,6 @@ void LayerNormGradKernel(const Context& ctx, const DenseTensor& out_grad, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad); diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h index 28ffdfd47719b..2fddcec2278c9 100644 --- a/paddle/phi/kernels/layer_norm_kernel.h +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -26,7 +26,6 @@ void LayerNormKernel(const Context& ctx, const paddle::optional& bias, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* out, DenseTensor* mean, DenseTensor* variance); diff --git a/paddle/phi/kernels/matrix_rank_kernel.h b/paddle/phi/kernels/matrix_rank_kernel.h index 6edea2723e589..f0cb90097a3fc 100644 --- a/paddle/phi/kernels/matrix_rank_kernel.h +++ b/paddle/phi/kernels/matrix_rank_kernel.h @@ -22,8 +22,8 @@ template void MatrixRankKernel(const Context& dev_ctx, const DenseTensor& x, float tol, - bool use_default_tol, bool hermitian, + bool use_default_tol, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc index 1fedcb14f5426..2512304944e85 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc @@ -47,10 +47,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& out_grad, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, @@ -77,10 +73,6 @@ void HSigmoidLossGradKernel(const Context& ctx, out_grad, num_classes, remote_prefetch, - trainer_id, - height_sections, - epmap, - table_names, is_sparse, x_grad, w_grad_value, diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h index fe4ffe24601ae..94ac63183fbfb 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h @@ -32,10 +32,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& out_grad, int num_classes, bool remote_prefetch, - int trainer_id, - const std::vector& height_sections, - const std::vector& epmap, - const std::vector& table_names, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, diff --git a/paddle/phi/kernels/tril_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h similarity index 66% rename from paddle/phi/kernels/tril_grad_kernel.h rename to paddle/phi/kernels/tril_triu_grad_kernel.h index 7fc5e77363c62..42ddc2aba9d8a 100644 --- a/paddle/phi/kernels/tril_grad_kernel.h +++ 
b/paddle/phi/kernels/tril_triu_grad_kernel.h @@ -18,11 +18,23 @@ namespace phi { +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad); + template void TrilGradKernel(const Context& ctx, const DenseTensor& out_grad, int diagonal, - bool lower, + DenseTensor* x_grad); + +template +void TriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/tril_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h similarity index 60% rename from paddle/phi/kernels/tril_kernel.h rename to paddle/phi/kernels/tril_triu_kernel.h index 52154c2b2bcb5..cbec3b6221da1 100644 --- a/paddle/phi/kernels/tril_kernel.h +++ b/paddle/phi/kernels/tril_triu_kernel.h @@ -19,22 +19,34 @@ namespace phi { +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out); + template void TrilKernel(const Context& ctx, const DenseTensor& x, int diagonal, - bool lower, DenseTensor* out); template -DenseTensor Tril(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower) { +void TriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + DenseTensor* out); + +template +DenseTensor TrilTriu(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - TrilInferMeta(x, diagonal, lower, &meta_out); - TrilKernel(ctx, x, diagonal, lower, &dense_out); + TrilTriuInferMeta(x, diagonal, lower, &meta_out); + TrilTriuKernel(ctx, x, diagonal, lower, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc index d7d8e9058e861..9bf2d779d0205 100644 --- a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc @@ -29,7 +29,6 @@ void LayerNormGradKernel(const Context& ctx, const DenseTensor& out_grad, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { diff --git a/paddle/phi/kernels/xpu/layer_norm_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_kernel.cc index e1895bd0a745e..8fc3edffbcd66 100644 --- a/paddle/phi/kernels/xpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/xpu/layer_norm_kernel.cc @@ -26,7 +26,6 @@ void LayerNormKernel(const Context& ctx, const paddle::optional& bias, float epsilon, int begin_norm_axis, - bool is_test, DenseTensor* out, DenseTensor* mean, DenseTensor* variance) { diff --git a/paddle/phi/kernels/xpu/tril_grad_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc similarity index 67% rename from paddle/phi/kernels/xpu/tril_grad_kernel.cc rename to paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc index af8dfdd8c0bad..bf6b083a431f7 100644 --- a/paddle/phi/kernels/xpu/tril_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/tril_grad_kernel.h" +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,11 +20,11 @@ namespace phi { template -void TrilGradKernel(const Context& ctx, - const DenseTensor& out_grad, - int diagonal, - bool lower, - DenseTensor* x_grad) { +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(x_grad); auto dy_shape = vectorize(out_grad.dims()); @@ -46,7 +46,27 @@ void TrilGradKernel(const Context& ctx, } } +template +void TrilGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + DenseTensor* x_grad) { + TrilTriuGradKernel(ctx, out_grad, diagonal, true, x_grad); +} + +template +void TriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + DenseTensor* x_grad) { + TrilTriuGradKernel(ctx, out_grad, diagonal, false, x_grad); +} + } // namespace phi PD_REGISTER_KERNEL( tril_grad, XPU, ALL_LAYOUT, phi::TrilGradKernel, int, float) {} +PD_REGISTER_KERNEL( + triu_grad, XPU, ALL_LAYOUT, phi::TriuGradKernel, int, float) {} +PD_REGISTER_KERNEL( + tril_triu_grad, XPU, ALL_LAYOUT, phi::TrilTriuGradKernel, int, float) {} diff --git a/paddle/phi/kernels/xpu/tril_kernel.cc b/paddle/phi/kernels/xpu/tril_triu_kernel.cc similarity index 69% rename from paddle/phi/kernels/xpu/tril_kernel.cc rename to paddle/phi/kernels/xpu/tril_triu_kernel.cc index 4b4cf579c26c6..98defd569167f 100644 --- a/paddle/phi/kernels/xpu/tril_kernel.cc +++ b/paddle/phi/kernels/xpu/tril_triu_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/tril_kernel.h" +#include "paddle/phi/kernels/tril_triu_kernel.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,11 +20,11 @@ namespace phi { template -void TrilKernel(const Context& ctx, - const DenseTensor& x, - int diagonal, - bool lower, - DenseTensor* out) { +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(out); auto xshape = vectorize(x.dims()); @@ -46,6 +46,25 @@ void TrilKernel(const Context& ctx, } } +template +void TrilKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + DenseTensor* out) { + TrilTriuKernel(ctx, x, diagonal, true, out); +} + +template +void TriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + DenseTensor* out) { + TrilTriuKernel(ctx, x, diagonal, false, out); +} + } // namespace phi +PD_REGISTER_KERNEL( + tril_triu, XPU, ALL_LAYOUT, phi::TrilTriuKernel, int, float) {} PD_REGISTER_KERNEL(tril, XPU, ALL_LAYOUT, phi::TrilKernel, int, float) {} +PD_REGISTER_KERNEL(triu, XPU, ALL_LAYOUT, phi::TriuKernel, int, float) {} diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc index c4e04e5d40b02..a8db0b33242bd 100644 --- a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -19,14 +19,8 @@ namespace phi { KernelSignature HierarchicalSigmoidOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("hsigmoid_loss", - {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, - {"num_classes", - "remote_prefetch", - "trainer_id", - "height_sections", - "epmap", - "table_names", - "is_sparse"}, + {"X", "Label", "W", "Bias", "PathTable", "PathCode"}, + {"num_classes", "remote_prefetch", "is_sparse"}, {"Out", "PreOut", "W_Out"}); } @@ -42,13 +36,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", - "remote_prefetch", - "trainer_id", - "height_sections", - "epmap", - "table_names", - "is_sparse"}, + {"num_classes", "remote_prefetch", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else if (ctx.IsSelectedRowsOutput("W@GRAD")) { return KernelSignature("hsigmoid_loss_grad_sr", @@ -60,13 +48,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", - "remote_prefetch", - "trainer_id", - "height_sections", - "epmap", - "table_names", - "is_sparse"}, + {"num_classes", "remote_prefetch", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else { return KernelSignature("unregistered", {}, {}, {}); diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc index ab4f9ab817157..d2e75a700d2b8 100644 --- a/paddle/phi/ops/compat/layer_norm_sig.cc +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -19,7 +19,7 @@ namespace phi { KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("layer_norm", {"X", "Scale", "Bias"}, - {"epsilon", "begin_norm_axis", "is_test"}, + {"epsilon", "begin_norm_axis"}, {"Y", "Mean", "Variance"}); } @@ -27,7 +27,7 @@ KernelSignature LayerNormGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("layer_norm_grad", {"X", "Scale", "Bias", "Mean", "Variance", "Y@GRAD"}, - {"epsilon", "begin_norm_axis", "is_test"}, + {"epsilon", "begin_norm_axis"}, {"X@GRAD", 
"Scale@GRAD", "Bias@GRAD"}); } diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc index bb884e9c3499b..3a9a040062703 100644 --- a/paddle/phi/ops/compat/matrix_rank_sig.cc +++ b/paddle/phi/ops/compat/matrix_rank_sig.cc @@ -28,8 +28,8 @@ KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) { {"X"}, { "tol", - "use_default_tol", "hermitian", + "use_default_tol", }, {"Out"}); } diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc index 3cf022c60e3da..3c5fa15b41cae 100644 --- a/paddle/phi/ops/compat/tril_triu_sig.cc +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -17,19 +17,16 @@ limitations under the License. */ namespace phi { KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("tril", {"X"}, {"diagonal", "lower"}, {"Out"}); + return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); } KernelSignature TrilTriuGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "tril_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); + "tril_triu_grad", {"Out@GRAD"}, {"diagonal", "lower"}, {"X@GRAD"}); } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(tril_triu, tril); -PD_REGISTER_BASE_KERNEL_NAME(tril_triu_grad, tril_grad); - PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 1c864a8cd9034..4c8b9d7f555f1 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1782,7 +1782,6 @@ def forward(self, input): self.bias, self._epsilon, self._begin_norm_axis, - False, ) return dygraph_utils._append_activation_in_dygraph( pre_act, act=self._act diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 5ac218f9ebd7a..db3627d21ce75 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -170,18 +170,15 @@ def hsigmoidWithCustomTree( def python_api( input, - weight, label, + weight, + bias=None, path_table=None, path_code=None, - bias=None, num_classes=-1, is_sparse=False, remote_prefetch=False, ): - assert ( - is_sparse == remote_prefetch - ), "is_sparse is equal to remote_prefetch in dygraph." 
return paddle.nn.functional.hsigmoid_loss( input, label, diff --git a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py index 72c104ad85c28..671d0831a57bd 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_rank_op.py @@ -26,7 +26,7 @@ np.random.seed(SEED) -def matrix_rank_wraper(x, tol=None, use_default_tol=True, hermitian=False): +def matrix_rank_wraper(x, tol=None, hermitian=False, use_default_tol=True): return paddle.linalg.matrix_rank(x, tol, hermitian) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 6cf352086b415..efe26e5a42ebf 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1017,17 +1017,13 @@ def hsigmoid_loss( if in_dygraph_mode(): out, _, _ = _C_ops.hsigmoid_loss( input, - weight, label, + weight, + bias, path_table, path_code, - bias, num_classes, is_sparse, - 0, - [], - [], - [], is_sparse, ) return out diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index cf487a23329fd..c356b8c994075 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -385,7 +385,7 @@ def layer_norm( pre_act, _, _, - ) = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis, False) + ) = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 774261237d228..9a907049f5400 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1183,7 +1183,7 @@ def triu(x, diagonal=0, name=None): """ if in_dygraph_mode(): - return _C_ops.tril(x, diagonal, False) + return _C_ops.triu(x, diagonal, False) if _in_legacy_dygraph(): op = getattr(_legacy_C_ops, 'tril_triu') diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 41166b71a0cf6..1a3a5595928e3 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1601,7 +1601,7 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): else: tol_attr = float(tol) use_default_tol = False - return _C_ops.matrix_rank(x, tol_attr, use_default_tol, hermitian) + return _C_ops.matrix_rank(x, tol_attr, hermitian, use_default_tol) if _in_legacy_dygraph(): if tol is None: From 2f34fc7a1dcbeabc86b91febd6a22bd4202f04d3 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 17 Nov 2022 14:46:42 +0800 Subject: [PATCH 063/210] rm "paddle/fluid/framework/convert_utils.h" in phi (#48001) --- paddle/fluid/framework/convert_utils.cc | 35 --------------- paddle/fluid/framework/convert_utils.h | 4 +- .../operators/prune_gate_by_capacity_op.cu | 2 +- paddle/phi/core/utils/data_type.h | 45 +++++++++++++++++++ paddle/phi/infermeta/unary.cc | 14 +++--- .../kernels/cpu/index_sample_grad_kernel.cc | 22 ++++----- paddle/phi/kernels/cpu/index_sample_kernel.cc | 22 ++++----- .../kernels/cpu/put_along_axis_grad_kernel.cc | 11 +++-- .../phi/kernels/cpu/put_along_axis_kernel.cc | 17 ++++--- .../phi/kernels/cpu/take_along_axis_kernel.cc | 9 ++-- paddle/phi/kernels/funcs/math_function.h | 1 - paddle/phi/kernels/funcs/unique_functor.h | 22 ++++----- .../phi/kernels/gpu/fill_diagonal_kernel.cu | 1 - .../kernels/gpu/index_sample_grad_kernel.cu | 22 ++++----- paddle/phi/kernels/gpu/index_sample_kernel.cu | 22 
++++----- .../kernels/gpu/put_along_axis_grad_kernel.cu | 11 +++-- .../phi/kernels/gpu/put_along_axis_kernel.cu | 17 ++++--- .../phi/kernels/gpu/sync_batch_norm_utils.h | 4 +- .../gpu/take_along_axis_grad_kernel.cu | 9 ++-- .../phi/kernels/gpu/take_along_axis_kernel.cu | 9 ++-- 20 files changed, 138 insertions(+), 161 deletions(-) diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc index 112894c20697c..49efde53f4a8e 100644 --- a/paddle/fluid/framework/convert_utils.cc +++ b/paddle/fluid/framework/convert_utils.cc @@ -162,40 +162,5 @@ DataType String2DataType(const std::string& str) { } } -std::string DataType2String(DataType dtype) { - switch (dtype) { - case DataType::BOOL: - return "bool"; - case DataType::INT8: - return "int8"; - case DataType::UINT8: - return "uint8"; - case DataType::INT16: - return "int16"; - case DataType::INT32: - return "int32"; - case DataType::INT64: - return "int64"; - case DataType::FLOAT16: - return "float16"; - case DataType::FLOAT32: - return "float32"; - case DataType::FLOAT64: - return "float64"; - case DataType::COMPLEX64: - return "complex64"; - case DataType::COMPLEX128: - return "complex128"; - case DataType::PSTRING: - return "pstring"; - case DataType::BFLOAT16: - return "bfloat16"; - default: - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Unknow phi::DataType, the int value = %d.", - static_cast(dtype))); - return ""; - } -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index 42723b9a092bd..d3aca94003045 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_meta.h" #include "paddle/fluid/framework/data_type.h" +#include "paddle/phi/core/utils/data_type.h" // TODO(chenweihang): this file may need to be removed @@ -37,7 +38,8 @@ paddle::framework::proto::VarType::Type TransToProtoVarType( size_t DataTypeSize(DataType dtype); DataType String2DataType(const std::string& str); -std::string DataType2String(DataType dtype); + +using phi::DataType2String; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 9f5751fe0bdc7..cf0763be27d5b 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -121,7 +121,7 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { framework::TensorCopy(*expert_count, context.GetPlace(), &expert_count_out); PruneGateByCapacityFunctor functor( context, gate_idx, &expert_count_out, new_gate_idx_data); - VisitDataType(expert_count->type(), functor); + ::paddle::operators::VisitDataType(expert_count->type(), functor); } }; diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 5e53d8c95b0b9..a7bf1123a05a0 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -41,6 +41,14 @@ static std::map var_type_map{{1, phi::DataType::INT16}, {6, phi::DataType::FLOAT64}, {20, phi::DataType::UINT8}}; +static std::map map_to_var_type{{phi::DataType::INT16, 1}, + {phi::DataType::INT32, 2}, + {phi::DataType::INT64, 3}, + {phi::DataType::FLOAT16, 4}, + {phi::DataType::FLOAT32, 5}, + {phi::DataType::FLOAT64, 6}, + {phi::DataType::UINT8, 20}}; + #define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ 
callback(cpp_type, data_type); @@ -129,4 +137,41 @@ inline DataType ToRealType(const DataType& type) { type)); } } + +inline std::string DataType2String(DataType dtype) { + switch (dtype) { + case DataType::BOOL: + return "bool"; + case DataType::INT8: + return "int8"; + case DataType::UINT8: + return "uint8"; + case DataType::INT16: + return "int16"; + case DataType::INT32: + return "int32"; + case DataType::INT64: + return "int64"; + case DataType::FLOAT16: + return "float16"; + case DataType::FLOAT32: + return "float32"; + case DataType::FLOAT64: + return "float64"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; + case DataType::PSTRING: + return "pstring"; + case DataType::BFLOAT16: + return "bfloat16"; + default: + PADDLE_THROW( + errors::InvalidArgument("Unknow phi::DataType, the int value = %d.", + static_cast(dtype))); + return ""; + } +} + } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a3da5aca24e11..5f602a134ecd9 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,11 +17,11 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/parse_qr_mode.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/slice_utils.h" @@ -133,12 +133,9 @@ void ArgMinMaxInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " "received [%s]", - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64), + phi::DataType2String(var_type_map[dtype]))); if (!config.is_runtime && axis.FromTensor()) { std::vector vec; @@ -180,11 +177,10 @@ void ArgMinMaxInferMeta(const MetaTensor& x, auto x_rank = x_dims.size(); if (int_axis < 0) int_axis += x_rank; if (config.is_runtime) { - if (dtype == paddle::framework::proto::VarType::INT32) { + if (dtype == map_to_var_type[DataType::INT32]) { int64_t all_element_num = 0; if (flatten) { all_element_num = phi::product(x_dims); - } else { all_element_num = x_dims[int_axis]; } diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index fe8ca4e432e21..50f2c3267fbc5 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/index_sample_grad_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { template void IndexSampleGradInner(const Context& context, @@ -76,18 +76,14 @@ void IndexSampleGradKernel(const Context& ctx, auto index_type = index.dtype(); bool index_type_match = index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - 
true, - errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(index_type)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT32)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType((DataType::INT64))))); + PADDLE_ENFORCE_EQ(index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + phi::DataType2String(index_type), + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64))); if (index_type == DataType::INT32) { IndexSampleGradInner(ctx, out_grad, index, x_grad); } else if (index_type == DataType::INT64) { diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index faa6953704e80..11e24b10b153b 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -21,11 +21,11 @@ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { template void IndexSampleInner(const Context &context, @@ -89,18 +89,14 @@ void IndexSampleKernel(const Context &ctx, auto index_type = index.dtype(); bool index_type_match = index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - true, - errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(index_type)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT32)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType((DataType::INT64))))); + PADDLE_ENFORCE_EQ(index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + phi::DataType2String(index_type), + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64))); if (index_type == DataType::INT32) { IndexSampleInner(ctx, x, index, out); } else if (index_type == DataType::INT64) { diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index 8b313f435cd56..ca57c223beb4b 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/put_along_axis_grad_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -37,11 +37,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, true, errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU.")); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); + const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if 
(index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::cpu_scatter_input_grad_kernel( // Here passing an unused argument out_grad, because it's // convenient to instantiate a bunch of template function with the @@ -60,10 +59,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, if (value_grad) { value_grad->Resize(index.dims()); value_grad->mutable_data(dev_ctx.GetPlace()); - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::cpu_gather_kernel( out_grad, axis, index, *value_grad, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::cpu_gather_kernel( out_grad, axis, index, *value_grad, dev_ctx); } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index ed2884dddcc2c..e0cf5f6730c24 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/put_along_axis_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" @@ -37,29 +37,28 @@ void PutAlongAxisKernel(const Context& dev_ctx, errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU.")); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); + const auto& index_type = index.dtype(); if (reduce == "add") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::cpu_scatter_add_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::cpu_scatter_add_kernel( *out, axis, index, value, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::cpu_scatter_mul_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::cpu_scatter_mul_kernel( *out, axis, index, value, dev_ctx); } } else if (reduce == "assign") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::cpu_scatter_assign_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::cpu_scatter_assign_kernel( *out, axis, index, value, dev_ctx); } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index 29f8e5d3923d5..cd1ff2e926288 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -14,9 +14,9 @@ #include "paddle/phi/kernels/take_along_axis_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include 
"paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" @@ -36,12 +36,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, out->Resize(index.dims()); dev_ctx.template Alloc(out); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); - if (index_type == paddle::framework::proto::VarType::INT32) { + const auto& index_type = index.dtype(); + if (index_type == DataType::INT32) { paddle::operators::cpu_gather_kernel( x, axis, index, *out, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::cpu_gather_kernel( x, axis, index, *out, dev_ctx); } diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index e922864060c2b..86d2f5c8efb7e 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 2bb51cdab65c6..2c713243904eb 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -77,18 +77,14 @@ struct UniqueOpFunctor { const auto& index_type = index_->dtype(); bool index_type_match = index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - true, - phi::errors::InvalidArgument( - "Index holds the wrong type, it holds %s, " - "but desires to be %s or %s", - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(index_type)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT32)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT64)))); + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Index holds the wrong type, it holds %s, " + "but desires to be %s or %s", + phi::DataType2String(index_type), + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64))); if (index_type == DataType::INT32) { for (auto i = 0; i < in_->numel(); ++i) { diff --git a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu index 1d342abc31745..d2a1ec4b5b7fb 100644 --- a/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/fill_diagonal_kernel.cu @@ -17,7 +17,6 @@ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index 6c94e14492bc0..5193d0a7ab05c 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -17,11 +17,11 @@ #include 
#include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -70,18 +70,14 @@ void IndexSampleGradKernel(const Context& ctx, auto index_type = index.dtype(); bool index_type_match = index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - true, - errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(index_type)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT32)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType((DataType::INT64))))); + PADDLE_ENFORCE_EQ(index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + phi::DataType2String(index_type), + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64))); auto stream = reinterpret_cast(ctx).stream(); auto input_num = x.numel(); diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu index 5e6bd8701a9dc..dd1b4aa97d2b6 100644 --- a/paddle/phi/kernels/gpu/index_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -17,10 +17,10 @@ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -59,18 +59,14 @@ void IndexSampleKernel(const Context& ctx, auto index_type = index.dtype(); bool index_type_match = index_type == DataType::INT32 || index_type == DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, - true, - errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(index_type)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType(DataType::INT32)), - paddle::framework::DataTypeToString( - paddle::framework::TransToProtoVarType((DataType::INT64))))); + PADDLE_ENFORCE_EQ(index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + phi::DataType2String(index_type), + phi::DataType2String(DataType::INT32), + phi::DataType2String(DataType::INT64))); const T* in_data = x.data(); T* out_data = ctx.template Alloc(out); auto stream = reinterpret_cast(ctx).stream(); diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index 6ecc3383818d8..16c32886e235a 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -14,12 +14,12 @@ #include "paddle/phi/kernels/put_along_axis_grad_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include 
"paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { @@ -37,11 +37,10 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, errors::PreconditionNotMet( "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); + const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_scatter_input_grad_kernel( out_grad, axis, index, *x_grad, dev_ctx); } else { @@ -52,14 +51,14 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, if (value_grad) { value_grad->Resize(index.dims()); value_grad->mutable_data(dev_ctx.GetPlace()); - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_gather_kernel( out_grad, axis, index, *value_grad, dev_ctx); // the gradient of scatter is gather - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::gpu_gather_kernel( out_grad, axis, index, *value_grad, dev_ctx); } diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index 658041e3f3e24..b43d6fafa72a6 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -14,12 +14,12 @@ #include "paddle/phi/kernels/put_along_axis_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { @@ -36,31 +36,30 @@ void PutAlongAxisKernel(const Context& dev_ctx, errors::PreconditionNotMet( "PutAlongAxisCUDAKernel only runs on GPU device.")); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); + const auto& index_type = index.dtype(); phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); if (reduce == "add") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_scatter_add_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::gpu_scatter_add_kernel( *out, axis, index, value, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_scatter_mul_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::gpu_scatter_mul_kernel( *out, axis, index, value, dev_ctx); } } else if (reduce == "assign") { - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_scatter_assign_kernel( *out, axis, index, value, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { 
paddle::operators::gpu_scatter_assign_kernel( *out, axis, index, value, dev_ctx); } diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index 544544591f333..7262734516253 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -30,7 +30,6 @@ namespace cub = hipcub; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #endif -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -431,8 +430,7 @@ void SyncBatchNormGradFunctor( } if (comm) { - int dtype = paddle::platform::ToNCCLDataType( - paddle::framework::TransToProtoVarType(scale.dtype())); + int dtype = paddle::platform::ToNCCLDataType(scale.dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::ncclAllReduce(stats, diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index a88aa6596b132..07afc3ba8bb18 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/take_along_axis_grad_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -43,17 +43,16 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, // Set to zero tensor. 
phi::funcs::SetConstant functor; functor(dev_ctx, x_grad, static_cast(0)); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); + const auto& index_type = index.dtype(); - if (index_type == paddle::framework::proto::VarType::INT32) { + if (index_type == DataType::INT32) { paddle::operators::gpu_scatter_add_kernel( *x_grad, axis, index, out_grad, dev_ctx); // the gradient of gather is scatter - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::gpu_scatter_add_kernel( *x_grad, axis, index, out_grad, dev_ctx); } diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 6b5203f246592..28a1c9b657d7a 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/take_along_axis_kernel.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { @@ -36,12 +36,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, out->Resize(index.dims()); dev_ctx.template Alloc(out); - const auto& index_type = - paddle::framework::TransToProtoVarType(index.dtype()); - if (index_type == paddle::framework::proto::VarType::INT32) { + const auto& index_type = index.dtype(); + if (index_type == DataType::INT32) { paddle::operators::gpu_gather_kernel( x, axis, index, *out, dev_ctx); - } else if (index_type == paddle::framework::proto::VarType::INT64) { + } else if (index_type == DataType::INT64) { paddle::operators::gpu_gather_kernel( x, axis, index, *out, dev_ctx); } From 04dcb9d78d72273b672e8807db9fa451c3f1aead Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 17 Nov 2022 15:16:25 +0800 Subject: [PATCH 064/210] fix new executor gc dep bug (#48068) --- .../framework/new_executor/garbage_collector/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt index d7ff6e4d50f20..340d0483fe14d 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/garbage_collector/CMakeLists.txt @@ -2,4 +2,4 @@ cc_library( interpretercore_garbage_collector SRCS garbage_collector.cc event_garbage_collector.cc fast_garbage_collector.cc no_event_garbage_collector.cc - DEPS garbage_collector) + DEPS garbage_collector executor_gc_helper) From 5329187df1a0cd9d40aab2d4e3810f0c8fa47bf2 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 17 Nov 2022 16:03:58 +0800 Subject: [PATCH 065/210] fix the thread number to ensure deterministic of embedding kernel (#48073) --- paddle/phi/kernels/gpu/embedding_grad_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 0cfe2e43d1875..8bb00f075929c 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -107,6 +107,7 @@ struct EmbeddingGradCUDAFunctor { if (FLAGS_cudnn_deterministic) { VLOG(2) << "Run grad kernel of embedding with single thread."; grids.x = 1; + 
threads.y = 1; } EmbeddingGrad<<>>( d_table, d_output, ids, N, K, D); From dbc63555f35a7263a37d77c9a736df3787cb75f0 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 17 Nov 2022 17:24:18 +0800 Subject: [PATCH 066/210] support int input for scale (#48044) * int scale * round * revert commit --- .../inference/tensorrt/convert/scale_op.cc | 8 +- paddle/fluid/inference/tensorrt/op_teller.cc | 25 +++- .../ir/inference/test_trt_convert_scale.py | 138 +++++++++++------- 3 files changed, 106 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index d770c21a9ad71..361ed22395532 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -49,9 +49,12 @@ class ScaleOpConverter : public OpConverter { PADDLE_GET_CONST(bool, op_desc.GetAttr("bias_after_scale")); float bias = PADDLE_GET_CONST(float, op_desc.GetAttr("bias")); float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("scale")); + bool is_int = input->getType() == nvinfer1::DataType::kINT32; nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - nvinfer1::ITensor* bias_tensor = Add1DConstantLayer(bias); + nvinfer1::ITensor* bias_tensor = + is_int ? Add1DConstantLayer(static_cast(bias)) + : Add1DConstantLayer(bias); bool is_bias_0 = (bias < 1e-06 && bias > -1e-06); std::vector bias_shapes(input->getDimensions().nbDims, 1); @@ -72,7 +75,8 @@ class ScaleOpConverter : public OpConverter { is_scale_1 = false; } else { has_scale_tensor = false; - scale_tensor = Add1DConstantLayer(scale); + scale_tensor = is_int ? Add1DConstantLayer(static_cast(scale)) + : Add1DConstantLayer(scale); is_scale_1 = ((scale - 1.0) < 1e-06 && (scale - 1.0) > -1e-06); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index dfe1b2ca623bc..f27c006f2080e 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1076,13 +1076,24 @@ struct SimpleOpTypeSetTeller : public Teller { auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); auto dtype = x_var_desc->GetDataType(); - // At present, only support float32 or float16 into trt. - if (!(dtype == 5 || dtype == 4)) { - return false; - } - if (!with_dynamic_shape && x_shape.size() == 1) { - VLOG(3) << "Scale op does not support 1-dimensional input in tensorrt"; - return false; + if (!with_dynamic_shape) { + // At present, only support float32 or float16 into trt. + if (!(dtype == framework::proto::VarType::FP32 || + dtype == framework::proto::VarType::FP16)) { + return false; + } + if (x_shape.size() == 1) { + VLOG(3) + << "Scale op does not support 1-dimensional input in tensorrt"; + return false; + } + } else { + // At present, only support float32 or float16 or int32 into trt. 
+ if (!(dtype == framework::proto::VarType::FP32 || + dtype == framework::proto::VarType::FP16 || + dtype == framework::proto::VarType::INT32)) { + return false; + } } } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index 27658d9286367..5e11bec684921 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -26,18 +26,24 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): + def generate_input1(attrs: List[Dict[str, Any]], batch, is_int): if self.dims == 4: - return np.ones([batch, 3, 24, 24]).astype(np.float32) + return np.ones([batch, 3, 24, 24]).astype( + np.int32 if is_int else np.float32 + ) elif self.dims == 3: - return np.ones([batch, 3, 24]).astype(np.float32) + return np.ones([batch, 3, 24]).astype( + np.int32 if is_int else np.float32 + ) elif self.dims == 2: - return np.ones([batch, 24]).astype(np.float32) + return np.ones([batch, 24]).astype( + np.int32 if is_int else np.float32 + ) elif self.dims == 1: - return np.ones([24]).astype(np.float32) + return np.ones([24]).astype(np.int32 if is_int else np.float32) - def generate_weight1(attrs: List[Dict[str, Any]]): - return np.ones([1]).astype(np.float32) + def generate_weight1(attrs: List[Dict[str, Any]], is_int): + return np.ones([1]).astype(np.int32 if is_int else np.float32) for num_input in [0, 1]: for dims in [1, 2, 3, 4]: @@ -45,58 +51,67 @@ def generate_weight1(attrs: List[Dict[str, Any]]): for scale in [0.1, -1.0]: for bias in [0.0, 1.2]: for bias_after_scale in [False, True]: - self.num_input = num_input - self.dims = dims - dics = [ - { - "scale": scale, - "bias": bias, - "bias_after_scale": bias_after_scale, - }, - {}, - ] - - dics_intput = [ - { - "X": ["scale_input"], - "ScaleTensor": ["ScaleTensor"], - }, - {"X": ["scale_input"]}, - ] - dics_intputs = [ - { - "ScaleTensor": TensorConfig( - data_gen=partial( - generate_weight1, dics + for is_int in [False, True]: + self.num_input = num_input + self.dims = dims + self.is_int = is_int + dics = [ + { + "scale": scale, + "bias": bias, + "bias_after_scale": bias_after_scale, + }, + {}, + ] + + dics_intput = [ + { + "X": ["scale_input"], + "ScaleTensor": ["ScaleTensor"], + }, + {"X": ["scale_input"]}, + ] + dics_intputs = [ + { + "ScaleTensor": TensorConfig( + data_gen=partial( + generate_weight1, + dics, + is_int, + ) ) - ) - }, - {}, - ] - - ops_config = [ - { - "op_type": "scale", - "op_inputs": dics_intput[num_input], - "op_outputs": {"Out": ["scale_out"]}, - "op_attrs": dics[0], - } - ] - ops = self.generate_op_config(ops_config) - program_config = ProgramConfig( - ops=ops, - weights=dics_intputs[num_input], - inputs={ - "scale_input": TensorConfig( - data_gen=partial( - generate_input1, dics, batch + }, + {}, + ] + + ops_config = [ + { + "op_type": "scale", + "op_inputs": dics_intput[num_input], + "op_outputs": { + "Out": ["scale_out"] + }, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights=dics_intputs[num_input], + inputs={ + "scale_input": TensorConfig( + data_gen=partial( + generate_input1, + dics, + batch, + is_int, + ) ) - ) - }, - outputs=["scale_out"], - ) + }, + outputs=["scale_out"], + ) - yield program_config + yield program_config 
def sample_predictor_configs( self, program_config @@ -182,6 +197,17 @@ def teller2(program_config, predictor_config): "INPUT DIM EQUAL TO 1 OF STATIC SHAPE NOT SUPPORT", ) + def teller3(program_config, predictor_config): + if self.is_int and len(self.dynamic_shape.min_input_shape) == 0: + return True + return False + + self.add_skip_case( + teller3, + SkipReasons.TRT_NOT_SUPPORT, + "INTEGER INPUT OF STATIC SHAPE NOT SUPPORT", + ) + def test(self): self.add_skip_trt_case() self.run_test() From 3f480af2b073e30df7ca048d98792d581e06fe94 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Thu, 17 Nov 2022 17:35:58 +0800 Subject: [PATCH 067/210] Refactor collective communication all_to_all, all_to_all_single C++ API (#48059) --- .../distributed/collective/ProcessGroup.h | 31 +-- .../collective/ProcessGroupNCCL.cc | 222 +++++++----------- .../distributed/collective/ProcessGroupNCCL.h | 22 +- .../collective/ProcessGroupStream.cc | 50 ++-- .../collective/ProcessGroupStream.h | 30 +-- paddle/fluid/pybind/distributed_py.cc | 220 +++++++++-------- paddle/fluid/pybind/process_group_utils.h | 17 +- .../communication/stream/all_to_all.py | 17 +- 8 files changed, 290 insertions(+), 319 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 029a64a25cca4..152bb1aa6f9d1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -46,7 +46,6 @@ enum class CommType : std::uint8_t { SEND = 9, RECV = 10, BARRIER = 11, - ALLTOALL_SINGLE = 12, UNKNOWN = 100, }; @@ -124,6 +123,17 @@ class ProcessGroup { GetBackendName())); } + virtual std::shared_ptr AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_to_all with sync_op flag.", + GetBackendName())); + } + virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::Unimplemented( @@ -255,25 +265,6 @@ class ProcessGroup { "ProcessGroup%s does not support alltoall", GetBackendName())); } - virtual std::shared_ptr AllToAll_Single( - std::vector&, // NOLINT - std::vector&, // NOLINT - std::vector&, - std::vector&) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); - } - - virtual std::shared_ptr AllToAllSingle( - std::vector&, // NOLINT - std::vector&, // NOLINT - std::vector&, - std::vector&, - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support alltoall_single", GetBackendName())); - } - virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index d7d5beea8959b..4a70b81e31093 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -184,6 +184,80 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( use_calc_stream); } +void CheckSizeOnEachRank(const phi::DDim& tensor_dim, + const std::vector& size_on_each_rank, + int world_size) { + int length_size_on_each_rank = size_on_each_rank.size(); + PADDLE_ENFORCE_EQ( + length_size_on_each_rank, + world_size, + platform::errors::InvalidArgument( + "The 
length of size_on_each_rank must be equal to world_size.")); + + int64_t sum_size_on_each_rank = + std::accumulate(size_on_each_rank.begin(), size_on_each_rank.end(), 0); + PADDLE_ENFORCE_EQ( + sum_size_on_each_rank, + tensor_dim[0], + platform::errors::InvalidArgument( + "The sum of size_on_each_rank must be equal to tensor's dim[0].")); +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op, + bool use_calc_stream) { + const phi::DDim& out_dim = out_tensor->dims(); + const phi::DDim& in_dim = in_tensor.dims(); + CheckSizeOnEachRank(out_dim, out_size_each_rank, size_); + CheckSizeOnEachRank(in_dim, in_size_each_rank, size_); + + return Collective( + out_tensor, + in_tensor, + [&](phi::DenseTensor* output, + const phi::DenseTensor& input, + ncclComm_t comm, + gpuStream_t stream) { + int64_t in_row_size = input.numel() / in_dim[0], + out_row_size = output->numel() / out_dim[0]; + int64_t in_offset = 0, in_numel = 0, out_offset = 0, out_numel = 0; + phi::DenseTensor input_partial, output_partial; + + GroupStart(); + for (auto i = 0; i < size_; i++) { + in_numel = in_size_each_rank[i] * in_row_size; + input_partial = GetPartialTensor(input, in_offset, in_numel); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + input_partial.data(), + in_numel, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_numel; + + out_numel = out_size_each_rank[i] * out_row_size; + output_partial = GetPartialTensor(*output, out_offset, out_numel); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_partial.data(), + out_numel, + platform::ToNCCLDataType(output->dtype()), + i, + comm, + stream)); + out_offset += out_numel; + } + GroupEnd(); + }, + CommType::ALLTOALL, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { PADDLE_ENFORCE_GE(opts.device_id, @@ -551,7 +625,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( std::vector dev_ctx_raw; dev_ctx_raw.resize(places.size()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + GroupStart(); for (size_t i = 0; i < places.size(); ++i) { platform::CUDADeviceGuard guard(places[i]); @@ -564,7 +638,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( dev_ctx_raw[i] = dev_ctx[i].get(); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + GroupEnd(); // TODO(sunyilun): for compatibility, will be removed later place_to_calc_event_.emplace(places_key, places[0]); @@ -1086,7 +1160,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( ncclComm_t comm, const gpuStream_t& stream) { size_t offset = 0; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + GroupStart(); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), @@ -1104,7 +1178,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( stream)); offset += input.numel() / size_; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + GroupEnd(); }, CommType::ALLTOALL); } @@ -1130,7 +1204,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( ncclComm_t comm, const gpuStream_t& stream) { size_t offset = 0; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + GroupStart(); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( GetPointerByOffset(input.data(), offset, 
input.dtype()), @@ -1148,141 +1222,13 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( stream)); offset += input.numel() / size_; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + GroupEnd(); }, CommType::ALLTOALL, sync_op, use_calc_stream); } -std::shared_ptr ProcessGroupNCCL::AllToAll_Single( - std::vector& in_tensors, - std::vector& out_tensors, - std::vector& in_sizes, - std::vector& out_sizes) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), - true, - platform::errors::InvalidArgument( - "The dtypes of input and output must be equal.")); - - std::vector in_dims = phi::vectorize(input.dims()); - std::vector out_dims = phi::vectorize(output.dims()); - CheckSplitSizes(&in_sizes, in_dims); - CheckSplitSizes(&out_sizes, out_dims); - - size_t in_offset = 0, out_offset = 0; - size_t in_length = 0, out_length = 0; - size_t in_row_size = input.numel() / in_dims[0]; - size_t out_row_size = output.numel() / out_dims[0]; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - for (auto i = 0; i < size_; i++) { - in_length = in_sizes[i] * in_row_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input.data(), in_offset, input.dtype()), - in_length, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - in_offset += in_length; - - out_length = out_sizes[i] * out_row_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - GetPointerByOffset(output.data(), out_offset, input.dtype()), - out_length, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - out_offset += out_length; - } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); - }, - CommType::ALLTOALL_SINGLE); -} - -std::shared_ptr ProcessGroupNCCL::AllToAllSingle( - std::vector& in_tensors, - std::vector& out_tensors, - std::vector& in_sizes, - std::vector& out_sizes, - bool sync_op, - bool use_calc_stream) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), - true, - platform::errors::InvalidArgument( - "The dtypes of input and output must be equal.")); - - std::vector in_dims = phi::vectorize(input.dims()); - std::vector out_dims = phi::vectorize(output.dims()); - CheckSplitSizes(&in_sizes, in_dims); - CheckSplitSizes(&out_sizes, out_dims); - - size_t in_offset = 0, out_offset = 0; - size_t in_length = 0, out_length = 0; - size_t in_row_size = input.numel() / in_dims[0]; - size_t out_row_size = output.numel() / out_dims[0]; - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - for (auto i = 0; i < size_; i++) { - in_length = in_sizes[i] * in_row_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - 
GetPointerByOffset(input.data(), in_offset, input.dtype()), - in_length, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - in_offset += in_length; - - out_length = out_sizes[i] * out_row_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - GetPointerByOffset(output.data(), out_offset, input.dtype()), - out_length, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - out_offset += out_length; - } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); - }, - CommType::ALLTOALL_SINGLE, - sync_op, - use_calc_stream); -} - std::shared_ptr ProcessGroupNCCL::Reduce( std::vector& in_tensors, std::vector& out_tensors, @@ -1396,7 +1342,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( const gpuStream_t& stream) { size_t offset = 0; if (rank_ == opts.root_rank) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + GroupStart(); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), @@ -1414,7 +1360,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( opts.root_rank, comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + GroupEnd(); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( output.data(), @@ -1456,7 +1402,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( "Input and output tensors should have the same shape.")); size_t offset = 0; if (rank_ == opts.root_rank) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + GroupStart(); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), @@ -1474,7 +1420,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( opts.root_rank, comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + GroupEnd(); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( output.data(), diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index dab6d9428892b..a6528be80b4a5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -109,6 +109,14 @@ class ProcessGroupNCCL final : public ProcessGroupStream { bool sync_op, bool use_calc_stream) override; + std::shared_ptr AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; @@ -171,20 +179,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { bool sync_op, bool use_calc_stream) override; - std::shared_ptr AllToAll_Single( - std::vector& in, - std::vector& out, - std::vector& in_sizes, - std::vector& out_sizes) override; - - std::shared_ptr AllToAllSingle( - std::vector& in_tensors, - std::vector& out_tensors, - std::vector& in_sizes, - std::vector& out_sizes, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 2561a4f5b295a..3839f70ac13e2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -73,6 +73,31 @@ std::shared_ptr 
ProcessGroupStream::AllReduce( "ProcessGroup%s does not support all_reduce.", GetBackendName())); } +std::shared_ptr ProcessGroupStream::AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op) { + return AllToAll(out_tensor, + in_tensor, + out_size_each_rank, + in_size_each_rank, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support all_to_all.", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::Broadcast( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -165,31 +190,6 @@ std::shared_ptr ProcessGroupStream::AllToAll( "ProcessGroup%s does not support do alltoall", GetBackendName())); } -std::shared_ptr ProcessGroupStream::AllToAllSingle( - std::vector& in_tensors, - std::vector& out_tensors, - std::vector& in_sizes, - std::vector& out_sizes, - bool sync_op) { - return AllToAllSingle(in_tensors, - out_tensors, - in_sizes, - out_sizes, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::AllToAllSingle( - std::vector& in_tensors, - std::vector& out_tensors, - std::vector& in_sizes, - std::vector& out_sizes, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do alltoall_single", GetBackendName())); -} - std::shared_ptr ProcessGroupStream::Reduce( std::vector& in_tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 15b0635c5a6b8..ad37c330681ac 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -89,6 +89,21 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); + std::shared_ptr AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op) override; + + virtual std::shared_ptr AllToAll( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& out_size_each_rank, + const std::vector& in_size_each_rank, + bool sync_op, + bool use_calc_stream); + std::shared_ptr Broadcast( phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, @@ -140,21 +155,6 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); - std::shared_ptr AllToAllSingle( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - std::vector& in_sizes, // NOLINT - std::vector& out_sizes, // NOLINT - bool sync_op) override; - - virtual std::shared_ptr AllToAllSingle( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - std::vector& in_sizes, // NOLINT - std::vector& out_sizes, // NOLINT - bool sync_op, - bool use_calc_stream); - std::shared_ptr Reduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index a596275015612..dbc4c57c656ba 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ 
b/paddle/fluid/pybind/distributed_py.cc @@ -277,7 +277,7 @@ void BindDistributed(py::module *m) { /*offset*/ 0, /*numel*/ -1, sync_op); - distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + SplitTensor(dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(dev_ctx); return task; }, @@ -316,84 +316,96 @@ void BindDistributed(py::module *m) { .def( "all_to_all", [](distributed::ProcessGroup &self, - py::handle py_in_tensor_list, py::handle py_out_tensor_list, + py::handle py_in_tensor_list, bool sync_op) { - auto in_tensor_list = - CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); - Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( - concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor_list = CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( concat_out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); + + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto p_in_tensor = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + auto in_dense = *p_in_tensor; // in_tensor_list should not be empty const auto &dev_ctx = self.GetDeviceContext(in_tensor_list.back().place()); - auto task = self.AllToAll(in_wrapper, out_wrapper, sync_op); - distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + int world_size = self.GetSize(); + auto task = + self.AllToAll(out_dense, + in_dense, + GetDefaultSplitSizes(*out_dense, world_size), + GetDefaultSplitSizes(in_dense, world_size), + sync_op); + SplitTensor(dev_ctx, *out_dense, &out_tensor_list); task->UpdateWaitChain(dev_ctx); return task; }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("sync_op"), py::call_guard()) .def( "all_to_all_tensor", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, bool sync_op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); - return self.AllToAll(in_wrapper, out_wrapper, sync_op); + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + int world_size = self.GetSize(); + return self.AllToAll( + out_dense, + in_dense, + GetDefaultSplitSizes(*out_dense, world_size), + GetDefaultSplitSizes(in_dense, world_size), + sync_op); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("sync_op"), py::call_guard()) .def( "all_to_all_single", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, py::handle py_out_tensor, - std::vector &in_sizes, - std::vector &out_sizes, + py::handle py_in_tensor, + const std::vector &out_sizes, + const std::vector &in_sizes, bool sync_op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto 
out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; - return self.AllToAllSingle( - in_wrapper, out_wrapper, in_sizes, out_sizes, sync_op); + return self.AllToAll( + out_dense, in_dense, out_sizes, in_sizes, sync_op); }, - py::arg("in"), py::arg("out"), - py::arg("in_sizes"), + py::arg("in"), py::arg("out_sizes"), + py::arg("in_sizes"), py::arg("sync_op"), py::call_guard()) @@ -674,18 +686,20 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroup &self, py::handle py_in_tensor, py::handle py_out_tensor, - std::vector in_sizes, - std::vector out_sizes) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + const std::vector in_sizes, + const std::vector out_sizes) { auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector in_tensors = {*in_dense}; - std::vector out_tensors = {*out_dense}; - return self.AllToAll_Single( - in_tensors, out_tensors, in_sizes, out_sizes); + auto *out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + return self.AllToAll( + out_dense, in_dense, out_sizes, in_sizes, /*sync_op*/ true); }, py::arg("in"), py::arg("out"), @@ -765,7 +779,7 @@ void BindDistributed(py::module *m) { /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); - distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + SplitTensor(dev_ctx, *out_dense, &out_tensor_list); return task; }, py::arg("out"), @@ -856,88 +870,96 @@ void BindDistributed(py::module *m) { .def( "all_to_all_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor_list, - py::handle py_out_tensor_list) { - auto in_tensor_list = - CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); - Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( - concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - + py::handle py_out_tensor_list, + py::handle py_in_tensor_list) { auto out_tensor_list = CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( concat_out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); - // in_tensor_list must not be empty + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto p_in_tensor = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + auto in_dense = *p_in_tensor; + + // in_tensor_list should not be empty const auto &dev_ctx = self.GetDeviceContext( in_tensor_list.back().place(), /*use_calc_stream*/ true); - auto task = self.AllToAll(in_wrapper, - out_wrapper, - /*sync_op*/ true, - /*use_calc_stream*/ true); - distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + int 
world_size = self.GetSize(); + auto task = + self.AllToAll(out_dense, + in_dense, + GetDefaultSplitSizes(*out_dense, world_size), + GetDefaultSplitSizes(in_dense, world_size), + /*sync_op*/ true, + /*use_calc_stream*/ true); + SplitTensor(dev_ctx, *out_dense, &out_tensor_list); return task; }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::call_guard()) .def( "all_to_all_tensor_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, - py::handle py_out_tensor) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - + py::handle py_out_tensor, + py::handle py_in_tensor) { auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); - return self.AllToAll(in_wrapper, - out_wrapper, - /*sync_op*/ true, - /*use_calc_stream*/ true); + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + int world_size = self.GetSize(); + return self.AllToAll( + out_dense, + in_dense, + GetDefaultSplitSizes(*out_dense, world_size), + GetDefaultSplitSizes(in_dense, world_size), + /*sync_op*/ true, + /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::call_guard()) .def( "all_to_all_single_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, py::handle py_out_tensor, - std::vector &in_sizes, - std::vector &out_sizes) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - + py::handle py_in_tensor, + const std::vector &out_sizes, + const std::vector &in_sizes) { auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); - return self.AllToAllSingle(in_wrapper, - out_wrapper, - in_sizes, - out_sizes, - /*sync_op*/ true, - /*use_calc_stream*/ true); + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; + + return self.AllToAll(out_dense, + in_dense, + out_sizes, + in_sizes, + /*sync_op*/ true, + /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), - py::arg("in_sizes"), + py::arg("in"), py::arg("out_sizes"), + py::arg("in_sizes"), py::call_guard()) .def( diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 35a5ef0b1bb14..05434957547a4 100644 --- a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -21,7 +21,7 @@ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { -namespace distributed { +namespace pybind { template struct ConcatDenseTensor { @@ -113,6 +113,10 @@ void ConcatDenseTensorWithType(const DeviceContext &dev_ctx, ConcatDenseTensor()( dev_ctx, t_list, p_out); break; + case phi::DataType::BFLOAT16: + ConcatDenseTensor()( + dev_ctx, t_list, p_out); + break; case phi::DataType::FLOAT32: ConcatDenseTensor()(dev_ctx, t_list, p_out); break; @@ -150,6 +154,10 
@@ void SplitDenseTensorWithType(const DeviceContext &dev_ctx, SplitDenseTensor()( dev_ctx, t_in, p_list); break; + case phi::DataType::BFLOAT16: + SplitDenseTensor()( + dev_ctx, t_in, p_list); + break; case phi::DataType::FLOAT32: SplitDenseTensor()(dev_ctx, t_in, p_list); break; @@ -249,5 +257,10 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, } } -} // namespace distributed +inline std::vector GetDefaultSplitSizes(const phi::DenseTensor &tensor, + int world_size) { + return std::vector(world_size, tensor.dims()[0] / world_size); +} + +} // namespace pybind } // namespace paddle diff --git a/python/paddle/distributed/communication/stream/all_to_all.py b/python/paddle/distributed/communication/stream/all_to_all.py index d05b53564a897..2787c6a3d4d09 100644 --- a/python/paddle/distributed/communication/stream/all_to_all.py +++ b/python/paddle/distributed/communication/stream/all_to_all.py @@ -75,11 +75,11 @@ def _all_to_all_in_dygraph( if use_calc_stream: return group.process_group.all_to_all_on_calc_stream( - in_tensor_list, out_tensor_list + out_tensor_list, in_tensor_list ) task = group.process_group.all_to_all( - in_tensor_list, out_tensor_list, sync_op + out_tensor_list, in_tensor_list, sync_op ) if sync_op: task.wait() @@ -243,18 +243,23 @@ def _alltoall_single_in_dygraph( sync_op, use_calc_stream, ): + world_size = dist.get_world_size() if out_split_sizes is None: - out_split_sizes = [] + out_split_sizes = [ + out_tensor.shape[0] // world_size for _ in range(world_size) + ] if in_split_sizes is None: - in_split_sizes = [] + in_split_sizes = [ + in_tensor.shape[0] // world_size for _ in range(world_size) + ] if use_calc_stream: return group.process_group.all_to_all_single_on_calc_stream( - in_tensor, out_tensor, in_split_sizes, out_split_sizes + out_tensor, in_tensor, out_split_sizes, in_split_sizes ) task = group.process_group.all_to_all_single( - in_tensor, out_tensor, in_split_sizes, out_split_sizes, sync_op + out_tensor, in_tensor, out_split_sizes, in_split_sizes, sync_op ) if sync_op: task.wait() From 209f684c2a2a3b3b4e5270182a50d0e85adc1602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 17 Nov 2022 17:42:50 +0800 Subject: [PATCH 068/210] remove stanh in nn.py under fluid (#47889) --- python/paddle/fluid/layers/nn.py | 46 ------------------- .../tests/unittests/test_activation_op.py | 2 +- 2 files changed, 1 insertion(+), 47 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1e740ca967343..ee85d94e2671d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -133,7 +133,6 @@ 'crop_tensor', 'relu6', 'pow', - 'stanh', 'hard_sigmoid', 'swish', 'prelu', @@ -9936,51 +9935,6 @@ def pow(x, factor=1.0, name=None): return out -@templatedoc() -def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): - """ - stanh activation. - - .. math:: - - out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} - - Parameters: - x (Tensor): The input Tensor with data type float32, float64. - scale_a (float, optional): The scale factor a of the input. Default is 0.67. - scale_b (float, optional): The scale factor b of the output. Default is 1.7159. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Tensor with the same data type and shape as ``x`` . - - Examples: - .. 
code-block:: python - - import paddle - - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = paddle.stanh(x, scale_a=0.67, scale_b=1.72) # [1.00616539, 1.49927628, 1.65933108, 1.70390463] - - """ - - if _non_static_mode(): - return _legacy_C_ops.stanh(x, 'scale_a', scale_a, 'scale_b', scale_b) - - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'stanh') - - helper = LayerHelper('stanh', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='stanh', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'scale_a': scale_a, 'scale_b': scale_b}, - ) - return out - - @templatedoc() def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ec2adcfcdc5dc..8c4ed1cc1fbeb 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2989,7 +2989,7 @@ def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', [10, 12]) - out = fluid.layers.stanh(x, self.scale_a, self.scale_b) + out = paddle.stanh(x, self.scale_a, self.scale_b) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_stanh(self.x_np, self.scale_a, self.scale_b) From 7619188abe811034dc982266196a5cb7795b53d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 17 Nov 2022 17:48:38 +0800 Subject: [PATCH 069/210] =?UTF-8?q?(fluid=E6=B8=85=E7=90=86)remove=20swish?= =?UTF-8?q?=20in=20nn.py=20under=20fluid=20(#47891)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove swish in nn.py under fluid * fix tswish test case --- python/paddle/fluid/layers/nn.py | 87 ------------------- .../test_mkldnn_elt_act_fuse_pass.py | 7 +- .../ir/inference/test_trt_activation_pass.py | 7 +- .../tests/unittests/test_activation_op.py | 2 +- 4 files changed, 8 insertions(+), 95 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ee85d94e2671d..01a2aa90ce834 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -134,7 +134,6 @@ 'relu6', 'pow', 'hard_sigmoid', - 'swish', 'prelu', 'brelu', 'leaky_relu', @@ -9979,92 +9978,6 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): return out -@templatedoc() -def swish(x, beta=1.0, name=None): - r""" - :alias_main: paddle.nn.functional.swish - :alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish - :old_api: paddle.fluid.layers.swish - - Elementwise swish activation function. See `Searching for Activation Functions `_ for more details. - - Equation: - - .. math:: - out = \\frac{x}{1 + e^{- beta * x}} - - Args: - x(Variable): Tensor or LoDTensor, dtype: float32 or float64, the input of swish activation. - - beta(float): Constant beta of swish operator, default 1.0. - - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - - Variable: Output of the swish activation, Tensor or LoDTensor, with the same dtype and shape with the input x. - - Examples: - - .. 
code-block:: python - - # declarative mode - import numpy as np - from paddle import fluid - - x = fluid.data(name="x", shape=(-1, 3), dtype="float32") - y = fluid.layers.swish(x, beta=2.0) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - start = fluid.default_startup_program() - main = fluid.default_main_program() - - data = np.random.randn(2, 3).astype("float32") - exe.run(start) - y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - - data - # array([[-1.1239197 , 1.3391294 , 0.03921051], - # [ 1.1970421 , 0.02440812, 1.2055548 ]], dtype=float32) - y_np - # array([[-0.2756806 , 1.0610548 , 0.01998957], - # [ 0.9193261 , 0.01235299, 0.9276883 ]], dtype=float32) - - - .. code-block:: python - - # imperative mode - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - - data = np.random.randn(2, 3).astype("float32") - place = fluid.CPUPlace() - with dg.guard(place) as g: - x = dg.to_variable(data) - y = fluid.layers.swish(x) - y_np = y.numpy() - data - # array([[-0.0816701 , 1.1603649 , -0.88325626], - # [ 0.7522361 , 1.0978601 , 0.12987892]], dtype=float32) - y_np - # array([[-0.03916847, 0.8835007 , -0.25835553], - # [ 0.51126915, 0.82324016, 0.06915068]], dtype=float32) - """ - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') - - helper = LayerHelper('swish', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'slope': beta}, - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index c9647ec60b5cb..b926a8f71a9d5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -92,8 +92,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Swish( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act_alpha = 4 - self.act = fluid.layers.swish + self.act = paddle.nn.functional.swish class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( @@ -194,7 +193,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.swish + self.act = paddle.nn.functional.swish class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( @@ -287,7 +286,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Swish( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.swish + self.act = paddle.nn.functional.swish class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 0f2a8a97430cd..80e51dbf26124 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class TensorRTSubgraphPassActivationTest(InferencePassTest): @@ 
-118,7 +119,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.swish(x) + return paddle.nn.functional.swish(x) class TensorRTSubgraphPassSwishFp16SerializeTest( @@ -131,7 +132,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.swish(x) + return paddle.nn.functional.swish(x) class TensorRTSubgraphPassDynamicSwishFp16SerializeTest( @@ -152,7 +153,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.swish(x) + return paddle.nn.functional.swish(x) class TensorRTSubgraphPassMishTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 8c4ed1cc1fbeb..4a80879608690 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -3551,7 +3551,7 @@ def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.swish(x) + out = paddle.nn.functional.swish(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_swish(self.x_np) From 099c2302370d936f963b4f46d07f7def3a65958d Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 17 Nov 2022 19:21:40 +0800 Subject: [PATCH 070/210] [NPU] add _npu_identity op and api, test=develop (#47850) * [NPU] add _npu_identity op and api, test=develop * fix doc * address comments --- paddle/phi/api/yaml/ops.yaml | 9 ++ paddle/phi/kernels/npu_identity_kernel.cc | 79 +++++++++++++++++ paddle/phi/kernels/npu_identity_kernel.h | 29 +++++++ .../tests/unittests/test_npu_identity_op.py | 57 +++++++++++++ python/paddle/incubate/__init__.py | 1 + python/paddle/incubate/tensor/__init__.py | 1 + python/paddle/incubate/tensor/manipulation.py | 84 +++++++++++++++++++ 7 files changed, 260 insertions(+) create mode 100644 paddle/phi/kernels/npu_identity_kernel.cc create mode 100644 paddle/phi/kernels/npu_identity_kernel.h create mode 100644 python/paddle/fluid/tests/unittests/test_npu_identity_op.py create mode 100644 python/paddle/incubate/tensor/manipulation.py diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 88ab2ee099ca6..32a498f112fd9 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -580,6 +580,15 @@ func : mv backward : mv_grad +- op : npu_identity + args : (Tensor x, int format = -1) + output : Tensor + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : npu_identity + - op : poisson args : (Tensor x) output : Tensor diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc new file mode 100644 index 0000000000000..0c1af9bb40a80 --- /dev/null +++ b/paddle/phi/kernels/npu_identity_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/npu_identity_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void NPUIdentityKernel(const Context& dev_ctx, + const DenseTensor& x, + const int format, + DenseTensor* out) { + VLOG(4) << "npu_identity op is only for NPU, CPU or GPU kernel just empty " + "tensor with shape: " + << out->dims() << ", please avoid using this kernel!"; + *out = phi::EmptyLike(dev_ctx, *out); +} + +} // namespace phi + +/** [ Why need npu_identity op? ] + * + * 1. Ascend CANN use internal storage format for high performance + * computing, for example if run BatchNorm2D op with CANN internal + * storage format ACL_FORMAT_NC1HWC0, time costs in transdata will + * be removed, and at will gain 2x performance improvement. + * + * 2.The internal storage format will use storage_properties_ in + * DenseTensor, and will change the size and layout of denser, and + * finally it should be called when change tensor to numpy and restore + * original size and format by calling CANN Identity OP. + * + * TODO(qili93): remove this op after custom op and custom device + * integrated and then move this op along with its code to plugin. + */ + +PD_REGISTER_KERNEL(npu_identity, + CPU, + ALL_LAYOUT, + phi::NPUIdentityKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool, + phi::dtype::float16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(npu_identity, + GPU, + ALL_LAYOUT, + phi::NPUIdentityKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/npu_identity_kernel.h b/paddle/phi/kernels/npu_identity_kernel.h new file mode 100644 index 0000000000000..4ef106de1d3aa --- /dev/null +++ b/paddle/phi/kernels/npu_identity_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void NPUIdentityKernel(const Context& dev_ctx, + const DenseTensor& x, + const int format, + DenseTensor* out); + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_npu_identity_op.py b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py new file mode 100644 index 0000000000000..183a86a8ceb80 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_npu_identity_op.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle + + +class TestNPUIdentityOp(unittest.TestCase): + def setUp(self): + self.op_type = "npu_identity" + self.shape = [64, 6, 28, 28] + self.x = np.random.random(self.shape).astype(np.float32) + self.format = 3 # ACL_FORMAT_NC1HWC0 = 3 + self.place = paddle.CPUPlace() + + def test_api_static(self): + paddle.enable_static() + + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + with paddle.static.program_guard(main_program, startup_program): + x_data = paddle.static.data( + shape=self.shape, name="data", dtype='float32' + ) + output = paddle.incubate._npu_identity(x=x_data, format=self.format) + exe = paddle.static.Executor() + exe.run(startup_program) + result = exe.run( + main_program, feed={x_data.name: self.x}, fetch_list=[output] + ) + + np.testing.assert_allclose(result[0].shape, self.shape, rtol=1e-08) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + x_tensor = paddle.to_tensor(self.x) + out = paddle.incubate._npu_identity(x_tensor, self.format) + + np.testing.assert_allclose(out.shape, self.shape, rtol=1e-08) + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 2730db97f0ed2..0c5e90c6975e2 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -27,6 +27,7 @@ from .tensor import segment_mean from .tensor import segment_max from .tensor import segment_min +from .tensor import _npu_identity from .passes import fuse_resnet_unit_pass from . import autograd # noqa: F401 diff --git a/python/paddle/incubate/tensor/__init__.py b/python/paddle/incubate/tensor/__init__.py index 01dfab4482d66..99413e41ab368 100644 --- a/python/paddle/incubate/tensor/__init__.py +++ b/python/paddle/incubate/tensor/__init__.py @@ -16,5 +16,6 @@ from .math import segment_mean from .math import segment_max from .math import segment_min +from .manipulation import _npu_identity __all__ = [] diff --git a/python/paddle/incubate/tensor/manipulation.py b/python/paddle/incubate/tensor/manipulation.py new file mode 100644 index 0000000000000..0722c94aa1d6c --- /dev/null +++ b/python/paddle/incubate/tensor/manipulation.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle import _C_ops, _legacy_C_ops + + +__all__ = [] + + +# TODO(qili93): remove this op after custom op and custom device +# integrated and then move this op along with its code to plugin. +def _npu_identity(x, format=-1): + """ + + This OP takes in the Tensor :attr:`x` and change it to ouptut with + aclFormat with int value. This API is only used for Ascend NPU. + + Args: + x(Tensor): An input N-D Tensor with data type bool, float16, + float32, float64, int32, int64, int16, int8, uint8. + format(int): Storage data format of the output in aclFormat, + default value is -1. + + Returns: + Tensor: A Tensor with acl storage format on Ascend NPU. + + Examples: + .. code-block:: python + + # required: npu + import paddle + + x = paddle.ones(shape=[6]) + y = paddle.incubate._npu_identity(x, 3) # ACL_FORMAT_NC1HWC0 = 3 + # y.shape = [1, 1, 1, 1, 16] + """ + if in_dygraph_mode(): + return _C_ops.npu_identity(x, format) + + if _in_legacy_dygraph(): + return _legacy_C_ops.npu_identity(x, format) + + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'int8', + 'uint8', + 'int16', + 'int32', + 'int64', + 'float16', + 'float32', + 'float64', + ], + 'npu_identity', + ) + + helper = LayerHelper('npu_identity', **locals()) + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=x.stop_gradient + ) + helper.append_op( + type='npu_identity', + inputs={'x': [x]}, + outputs={'out': [out]}, + attrs={'format': format}, + ) + return out From 74e3f26f24c6f6f5d9ba65ce211e01a25b645100 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 17 Nov 2022 20:49:36 +0800 Subject: [PATCH 071/210] [Clean fluid] Clean fluid elementwise_min/pow/mod/floordiv, remove API (#48040) * clean fluid elementwise_pow, remove API * clean elem_pow doc * clean elementwise_mod * clean elementwise min, floordiv, mod --- python/paddle/fluid/layers/distributions.py | 2 +- python/paddle/fluid/layers/nn.py | 181 ------------------ python/paddle/fluid/layers/rnn.py | 6 +- python/paddle/fluid/optimizer.py | 4 +- .../seq2seq_dygraph_model.py | 8 +- .../transformer_dygraph_model.py | 8 +- .../unittests/ipu/test_elemetwise_x_op_ipu.py | 4 +- .../paddle/fluid/tests/unittests/test_cond.py | 2 +- .../fluid/tests/unittests/test_layers.py | 6 +- .../test_optimizer_in_control_flow.py | 2 +- 10 files changed, 16 insertions(+), 207 deletions(-) diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index bd0b39caf0d9e..0010a39e7fc31 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -672,7 +672,7 @@ def _inv(self, value): one_diag = tensor.diag( tensor.ones(shape=[batch_shape[0]], dtype=self.loc.dtype) ) - inv_diag = nn.elementwise_pow(value, (one_all - 2 * one_diag)) + inv_diag = paddle.pow(value, (one_all - 2 * one_diag)) return inv_diag diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 01a2aa90ce834..e3e42244dc61e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -58,7 +58,6 @@ check_type, check_dtype, ) -import paddle from paddle.utils import deprecated from paddle import _C_ops, _legacy_C_ops @@ -151,10 +150,6 @@ 'elementwise_div', 'elementwise_sub', 'elementwise_mul', - 'elementwise_min', - 'elementwise_pow', - 'elementwise_mod', - 'elementwise_floordiv', 
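The replacements used throughout this patch follow the deprecation notes on the removed functions; a minimal sketch of the 2.x equivalents (values mirror the removed docstring examples, and paddle.minimum is taken as the counterpart of elementwise_min):

    import paddle

    x = paddle.to_tensor([10, 15, 8], dtype='int32')
    y = paddle.to_tensor([3, 6, 5], dtype='int32')

    paddle.pow(x, y)           # replaces fluid.layers.elementwise_pow
    paddle.minimum(x, y)       # replaces fluid.layers.elementwise_min
    paddle.remainder(x, y)     # replaces fluid.layers.elementwise_mod      -> [1, 3, 3]
    paddle.floor_divide(x, y)  # replaces fluid.layers.elementwise_floordiv -> [3, 2, 1]
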
'uniform_random_batch_size_like', 'gaussian_random', 'sampling_id', @@ -12369,187 +12364,11 @@ def gen_data(): return _elementwise_op(LayerHelper('elementwise_mul', **locals())) -def elementwise_min(x, y, axis=-1, act=None, name=None): - """ - :alias_main: paddle.elementwise_min - :alias: paddle.elementwise_min,paddle.tensor.elementwise_min,paddle.tensor.math.elementwise_min - :old_api: paddle.fluid.layers.elementwise_min - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_min(x, y) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value) #[1, 3, 2] - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_min(x, y, axis=1) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]] - """ - if _non_static_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name='elementwise_min' - ) - - return _elementwise_op(LayerHelper('elementwise_min', **locals())) - - -def elementwise_pow(x, y, axis=-1, act=None, name=None): - """ - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_pow(x, y) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value) #[2, 243, 16] - """ - if _non_static_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name='elementwise_pow' - ) - return _elementwise_op(LayerHelper('elementwise_pow', **locals())) - - -@deprecated(since="2.0.0", update_to="paddle.remainder") -def elementwise_mod(x, y, axis=-1, act=None, name=None): - """ - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 6, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_mod(x, y) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value) #[1, 3, 3] - """ - if _non_static_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name='elementwise_mod' - ) - - return _elementwise_op(LayerHelper('elementwise_mod', **locals())) - - -@deprecated(since="2.0.0", update_to="paddle.floor_divide") -def elementwise_floordiv(x, y, axis=-1, act=None, name=None): - """ - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - import paddle - - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 7, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_floordiv(x, y) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - - print(z_value) #[3, 2, 1] - """ - if _non_static_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, act=act, op_name='elementwise_floordiv' - ) - - return _elementwise_op(LayerHelper('elementwise_floordiv', **locals())) - - for func in [ elementwise_add, elementwise_div, elementwise_sub, elementwise_mul, - elementwise_pow, - elementwise_min, - elementwise_mod, - elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 5cfc6e37ce1cc..fc0603e227362 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1317,10 +1317,8 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state): scores = nn.reshape(scores, [-1, self.beam_size * self.vocab_size]) # TODO: add grad for topk then this beam search can be used to train topk_scores, topk_indices = paddle.topk(x=scores, k=self.beam_size) - beam_indices = nn.elementwise_floordiv( - topk_indices, self.vocab_size_tensor - ) - token_indices = nn.elementwise_mod(topk_indices, self.vocab_size_tensor) + beam_indices = paddle.floor_divide(topk_indices, self.vocab_size_tensor) + token_indices = paddle.remainder(topk_indices, self.vocab_size_tensor) next_log_probs = self._gather( nn.reshape(log_probs, [-1, self.beam_size * self.vocab_size]), topk_indices, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7eaa38636c252..e3d71a0bafa8c 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4802,7 +4802,7 @@ def _get_decay_pow(self, block): ) global_step = layers.cast(global_step, "float32") decay_var = block._clone_variable(self._decay_var) - decay_pow_acc = layers.elementwise_pow(decay_var, global_step) + decay_pow_acc = paddle.pow(decay_var, global_step) return decay_pow_acc, global_step def _create_ema_vars(self, param): @@ -7756,7 +7756,7 @@ def minimize(self, loss, startup_program=None): shape=[1], dtype='float32', value=1.0 ) - mod = layers.elementwise_mod(step, k) + mod = paddle.remainder(step, k) with layers.control_flow.Switch() 
as switch: with switch.case(step == one_var): for param_name in params: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 76c5efbc91663..d46778e838fc1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -459,12 +459,8 @@ def beam_search(self, inputs): input=scores, k=self.beam_size ) - beam_indices = fluid.layers.elementwise_floordiv( - topk_indices, vocab_size_tensor - ) - token_indices = fluid.layers.elementwise_mod( - topk_indices, vocab_size_tensor - ) + beam_indices = paddle.floor_divide(topk_indices, vocab_size_tensor) + token_indices = paddle.remainder(topk_indices, vocab_size_tensor) next_log_probs = self._gather(scores, topk_indices, batch_pos) x = 0 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index e824d80400723..69839cf72b5c7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -856,12 +856,8 @@ def gather(input, indices, batch_pos): topk_scores, topk_indices = fluid.layers.topk( input=scores, k=beam_size ) - beam_indices = fluid.layers.elementwise_floordiv( - topk_indices, vocab_size_tensor - ) - token_indices = fluid.layers.elementwise_mod( - topk_indices, vocab_size_tensor - ) + beam_indices = paddle.floor_divide(topk_indices, vocab_size_tensor) + token_indices = paddle.remainder(topk_indices, vocab_size_tensor) # update states caches = map_structure( diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 0dc66ba82cde8..3949a3fea1ab7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -150,7 +150,7 @@ def set_test_op(self): class TestPow(TestMul): def set_test_op(self): - self.op = paddle.fluid.layers.elementwise_pow + self.op = paddle.pow class TestMod(TestMul): @@ -161,7 +161,7 @@ def set_atol(self): self.rtol_fp16 = 1e-3 def set_test_op(self): - self.op = paddle.fluid.layers.elementwise_mod + self.op = paddle.remainder if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index a09ff49df2efc..176be73afe49f 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -379,7 +379,7 @@ def test_cond_op_in_condition(self): lambda: fluid.layers.cond( a == b, lambda: fluid.layers.elementwise_sub(a, b), - lambda: fluid.layers.elementwise_pow(a, b), + lambda: paddle.pow(a, b), ), ) append_backward(out) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2f08a12b4b4ff..36392d538ecaf 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -614,7 +614,7 @@ def test_elementwise_math(self): t6 = layers.data(name='t6', shape=[3, 3], dtype='float32') ret = layers.elementwise_add(t, t2) - ret = layers.elementwise_pow(ret, t3) + ret = paddle.pow(ret, t3) ret = layers.elementwise_div(ret, 
t4) ret = layers.elementwise_sub(ret, t5) ret = layers.elementwise_mul(ret, t6) @@ -627,14 +627,14 @@ def test_elementwise_math(self): with self.dynamic_graph(): with _test_eager_guard(): ret = layers.elementwise_add(to_variable(n), to_variable(n2)) - ret = layers.elementwise_pow(ret, to_variable(n3)) + ret = paddle.pow(ret, to_variable(n3)) ret = layers.elementwise_div(ret, to_variable(n4)) ret = layers.elementwise_sub(ret, to_variable(n5)) dy_eager_ret = layers.elementwise_mul(ret, to_variable(n6)) dy_eager_ret_value = dy_eager_ret.numpy() ret = layers.elementwise_add(to_variable(n), to_variable(n2)) - ret = layers.elementwise_pow(ret, to_variable(n3)) + ret = paddle.pow(ret, to_variable(n3)) ret = layers.elementwise_div(ret, to_variable(n4)) ret = layers.elementwise_sub(ret, to_variable(n5)) dy_ret = layers.elementwise_mul(ret, to_variable(n6)) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 9cc47fc3d4234..ee3e443a9dfc2 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -97,7 +97,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): id = fluid.data('id', [1], 'int32') two = layers.fill_constant([1], 'int32', 2) - mod_two = layers.elementwise_mod(id, two) == 0 + mod_two = paddle.remainder(id, two) == 0 if loss_in_switch: avg_loss = layers.case( From 8d08c9e03535dcfb4330c4aef2102df4b930b41c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 17 Nov 2022 21:15:03 +0800 Subject: [PATCH 072/210] [fluid clear] remove unstack in nn.py under fluid (#47927) * remove unstack in nn.py under fluid * remove unstack under fluid --- python/paddle/fluid/layers/nn.py | 63 -------------------------------- 1 file changed, 63 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e3e42244dc61e..150fef7948303 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -140,7 +140,6 @@ 'flatten', 'stack', 'pad2d', - 'unstack', 'unique', 'unique_with_counts', 'expand', @@ -10510,68 +10509,6 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0): return [out, loss_weight] -def unstack(x, axis=0, num=None): - """ - :alias_main: paddle.unstack - :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack - :old_api: paddle.fluid.layers.unstack - - **UnStack Layer** - - This layer unstacks input Tensor :code:`x` into several Tensors along :code:`axis`. - - If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. - If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, - and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is - raised. - - Args: - x (Tensor): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64. - axis (int): The axis along which the input is unstacked. - num (int|None): The number of output variables. - - Returns: - list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64. - - Raises: - ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D). - - Examples: - .. 
code-block:: python - - import paddle - x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] - y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] - - """ - - if _non_static_mode(): - if num is None: - num = x.shape[axis] - if num == 0: - return [] - return _legacy_C_ops.unstack(x, num, 'axis', int(axis), 'num', num) - - helper = LayerHelper('unstack', **locals()) - if num is None: - if axis is None or x.shape[axis] <= 0: - raise ValueError('unknown unstack number') - else: - num = x.shape[axis] - - outs = [] - for _ in range(num): - outs.append(helper.create_variable_for_type_inference(x.dtype)) - - helper.append_op( - type='unstack', - inputs={'X': [x]}, - outputs={'Y': outs}, - attrs={'axis': axis, 'num': num}, - ) - return outs - - @deprecated(since='2.0.0', update_to="paddle.expand") def expand(x, expand_times, name=None): """ From ef51bbfd33a5d11d0cef2969345691c8e9903a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 17 Nov 2022 21:15:36 +0800 Subject: [PATCH 073/210] remove fluid.layers.soft_relu in nn.py under fluid (#47925) --- python/paddle/fluid/layers/nn.py | 52 ------------------- .../tests/unittests/test_activation_op.py | 13 ----- .../fluid/tests/unittests/test_layers.py | 8 --- 3 files changed, 73 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 150fef7948303..6ff8f22a71921 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -136,7 +136,6 @@ 'prelu', 'brelu', 'leaky_relu', - 'soft_relu', 'flatten', 'stack', 'pad2d', @@ -10156,57 +10155,6 @@ def leaky_relu(x, alpha=0.02, name=None): return paddle.nn.functional.leaky_relu(x, alpha, name) -def soft_relu(x, threshold=40.0, name=None): - r""" - - SoftRelu Activation Operator. - - $out = \ln(1 + \exp(\max(\min(x, threshold), -threshold)))$ - - Args: - x(Variable): Input of soft_relu operator. Data type can be float32, float64. - threshold(float, optional): The threshold value of soft_relu, default value being 40.0. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable(Tensor|LoDTensor)): Output of soft_relu operator, shape and LoD same as input. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - import numpy as np - import paddle - - paddle.enable_static() - inputs = fluid.layers.data(name="x", shape=[2, 2], dtype="float32") - output = fluid.layers.soft_relu(inputs, threshold=20.0) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - img = np.array([[0, 1],[2, 3]]).astype(np.float32) - - res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) - print(res) # [array([[0.6931472, 1.3132616], [2.126928 , 3.0485873]], dtype=float32)] - """ - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'soft_relu' - ) - - helper = LayerHelper('soft_relu', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='soft_relu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}, - ) - return out - - def flatten(x, axis=1, name=None): r""" **Flatten op** diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 4a80879608690..f460ed58c1d9c 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2238,19 +2238,6 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', max_relative_error=0.02) -class TestSoftReluOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program()): - # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.soft_relu, 1) - # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.soft_relu, x_int32) - # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.soft_relu(x_fp16) - - def elu(x, alpha): out_ref = np.where(x > 0, x, alpha * (np.exp(x) - 1)) return out_ref.astype(x.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 36392d538ecaf..de5b2cb67fbba 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3689,14 +3689,6 @@ def make_prelu(self): ) return out - def make_soft_relu(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.soft_relu(input, threshold=30.0, name='soft_relu') - return out - def make_sigmoid(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() From fafc7be247b145107ac3045f04eec29e6ec301d4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Nov 2022 21:31:19 +0800 Subject: [PATCH 074/210] Clip intermediate output of op when save inference model (#48026) * clip extra and intermediate output of op * fix bug * fix bug * polich code * polich log --- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/operators/batch_norm_op.cc | 8 ++++--- paddle/fluid/operators/reshape_op.cc | 21 +++++++---------- python/paddle/fluid/dygraph/io.py | 31 ++++++++++++++++++++++++- python/paddle/fluid/framework.py | 4 ++-- 5 files changed, 47 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 1013cf8c49914..b5dad398448f7 100644 --- a/paddle/fluid/framework/operator.cc +++ 
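soft_relu is removed here without a direct 2.x counterpart; per the formula in the deleted docstring it can be rebuilt from public ops. A minimal sketch (expected values match the deleted example output):

    import paddle

    def soft_relu(x, threshold=40.0):
        # out = log(1 + exp(clip(x, -threshold, threshold)))
        return paddle.log1p(paddle.exp(paddle.clip(x, -threshold, threshold)))

    x = paddle.to_tensor([[0.0, 1.0], [2.0, 3.0]])
    soft_relu(x, threshold=20.0)  # ~[[0.6931, 1.3133], [2.1269, 3.0486]]
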
b/paddle/fluid/framework/operator.cc @@ -511,7 +511,7 @@ void OperatorBase::CheckAllInputOutputSet() const { } for (auto& out : info_->Proto().outputs()) { - if (!out.dispensable() && !out.extra()) { + if (!out.dispensable() && !out.extra() && !out.intermediate()) { PADDLE_ENFORCE_NE( outputs_.find(out.name()), outputs_.end(), diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 878ab18432cdc..7452c64f6fca8 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -158,12 +158,14 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { bias_dim[0])); } ctx->SetOutputDim("Y", x_dims); + ctx->ShareLoD("X", "Y"); VLOG(4) << x_dims; ctx->SetOutputDim("MeanOut", {C}); ctx->SetOutputDim("VarianceOut", {C}); - ctx->SetOutputDim("SavedMean", {C}); - ctx->SetOutputDim("SavedVariance", {C}); - ctx->ShareLoD("X", "Y"); + if (!test_mode) { + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + } if (ctx->HasOutput("ReserveSpace")) { ctx->SetOutputDim("ReserveSpace", {-1}); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index c33f8a95cfbec..e143d3e144b91 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -518,19 +518,16 @@ class Reshape2Op : public ReshapeOp { const framework::AttributeMap &attrs) : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), - true, - platform::errors::InvalidArgument( - "Output(XShape) of ReshapeOp should not be null.")); - const auto &x_dims = ctx->GetInputDim("X"); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; + if (ctx->HasOutput("XShape")) { + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); } - ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); - ctx->ShareLoD("X", /*->*/ "XShape"); - ReshapeOp::InferShape(ctx); } }; diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index eca171cacd330..f84949aa5e014 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -25,7 +25,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, OpProtoHolder from paddle.fluid.executor import ( _is_enable_standalone_executor, _is_dy2st_enable_standalone_executor, @@ -563,6 +563,35 @@ def _get_train_forward_program(self, infer_program_desc): stop_gradient=True, ) op.desc.set_output("ReserveSpace", [reserve_space.name]) + continue + + proto = OpProtoHolder.instance().get_op_proto(op.type) + has_create_intermediate_out = False + for output_proto in proto.outputs: + if output_proto.intermediate: + intermediate_name = output_proto.name + if intermediate_name not in op.output_names: + has_create_intermediate_out = True + intermediate_var = block.create_var( + name=unique_name.generate_with_ignorable_key( + ".".join( + [ + op.type + '_' + intermediate_name, + 'tmp', 
+ ] + ) + ), + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True, + ) + op.desc.set_output( + intermediate_name, [intermediate_var.name] + ) + if has_create_intermediate_out: + op.desc.infer_var_type(block.desc) + op.desc.infer_shape(block.desc) + return program @switch_to_static_graph diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 9f30a4e08a31f..4fc525003f71d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6175,8 +6175,8 @@ def _remove_training_info(self, clip_extra=True): if not find: remove_output_list.append(name) # The extra output of op will be removed in the future - # for name in remove_output_list: - # op.remove_output(name) + for name in remove_output_list: + op.remove_output(name) op_quant_name = ( core.op_proto_and_checker_maker.kOpWithQuantAttrName() From e4670d8074b5d5f21e71d177e0f0dd9700a51853 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Fri, 18 Nov 2022 10:33:52 +0800 Subject: [PATCH 075/210] rm "paddle/fluid/operators/amp/fp16_type_traits.h" in phi (#48051) --- paddle/fluid/operators/group_norm_op.cu | 2 +- paddle/fluid/operators/uniform_random_op.h | 2 +- paddle/phi/kernels/funcs/functors.h | 8 ++++---- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 4 ++-- paddle/phi/kernels/gpu/norm_kernel.cu | 4 ++-- paddle/phi/kernels/gpu/sgd_kernel.cu | 6 +++--- paddle/phi/kernels/primitive/functor_primitives.h | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 12a989bc82b1c..08ea4d3278b23 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -324,7 +324,7 @@ class GroupNormKernel : public framework::OpKernel { dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); if (data_layout == DataLayout::kNCHW) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename phi::dtype::MPTypeTrait::Type; constexpr int vec_size = sizeof(float4) / sizeof(T); int size = group_size * imsize; const int max_num_threads = 1024; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index bf2666deda28b..3ddf6092f04bf 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -165,7 +165,7 @@ void UniformRandom(const framework::ExecutionContext& context, if (seed == 0) { // Use global Generator seed - using MT = typename details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; phi::funcs::uniform_distribution dist; phi::funcs::uniform_real_transform trans(min, max); phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h index 2e6fe8b2d738b..3c7ae5ed09af3 100644 --- a/paddle/phi/kernels/funcs/functors.h +++ b/paddle/phi/kernels/funcs/functors.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/math.h" namespace phi { @@ -38,7 +38,7 @@ struct AddGradFunctor { template struct ScaleFunctor { - using MT = typename paddle::operators::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {} inline HOSTDEVICE T operator()(T ele) { @@ -125,7 +125,7 @@ struct SigmoidGradFunctor { template struct GeluFunctor { - using MT = typename paddle::operators::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; inline HOSTDEVICE T operator()(T x) { // this function is tanh approximation of gelu // actual gelu is: @@ -141,7 +141,7 @@ struct GeluFunctor { template struct GeluGradFunctor { - using MT = typename paddle::operators::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; inline HOSTDEVICE T UseX(T x) { MT mx = static_cast(x); MT tanh_out = diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu index bbbb6e9c018b6..cb02cc713852c 100644 --- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu @@ -22,8 +22,8 @@ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" @@ -38,7 +38,7 @@ __global__ void NormalizeGradient(const T* x, const int axis_n, const int post, T* x_grad) { - using MT = typename paddle::operators::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage_sum; int num = pre * post; diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index bd9cffe79614b..4843831ebfc68 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -22,8 +22,8 @@ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/common_shape.h" @@ -46,7 +46,7 @@ __global__ void Normalize(const T* x, const T eps, T* y, T* out_norm) { - using MT = typename paddle::operators::details::MPTypeTrait::Type; + using MT = typename phi::dtype::MPTypeTrait::Type; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; int num = pre * post; diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index ea257ebd1cc24..e3f0bf968c82c 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -15,10 +15,10 @@ #include "paddle/phi/kernels/sgd_kernel.h" #include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -72,7 +72,7 @@ 
void SGDDenseKernel(const Context& dev_ctx, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; // do check here // if (multi_precision) { // bool has_master = @@ -109,7 +109,7 @@ void SGDDenseParamSparseGradKernel( bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { - using MPDType = typename paddle::operators::details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; // do some check here // if (multi_precision) { // bool has_master = diff --git a/paddle/phi/kernels/primitive/functor_primitives.h b/paddle/phi/kernels/primitive/functor_primitives.h index 700ba00088517..b0f3d62823c09 100644 --- a/paddle/phi/kernels/primitive/functor_primitives.h +++ b/paddle/phi/kernels/primitive/functor_primitives.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" @@ -79,7 +79,7 @@ struct IdentityFunctor { template struct DivideFunctor { private: - using MPType = typename ::paddle::operators::details::MPTypeTrait::Type; + using MPType = typename ::phi::dtype::MPTypeTrait::Type; public: HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } From 85598e31fb75efff7a56c9506f1ac6aeaf506f3b Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 18 Nov 2022 03:38:11 +0100 Subject: [PATCH 076/210] fix onednn prelu header (#48064) --- paddle/phi/kernels/onednn/prelu_kernel.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/onednn/prelu_kernel.cc b/paddle/phi/kernels/onednn/prelu_kernel.cc index efe814130bb77..922cee98d1961 100644 --- a/paddle/phi/kernels/onednn/prelu_kernel.cc +++ b/paddle/phi/kernels/onednn/prelu_kernel.cc @@ -1,13 +1,16 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#include "paddle/phi/kernels/prelu_kernel.h" From 42f35841a890f61781d0cdf26f709583ca7db4b3 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Fri, 18 Nov 2022 11:17:26 +0800 Subject: [PATCH 077/210] fix: supoort huge length of attention (#48053) --- .../operators/math/bert_encoder_functor.cu | 65 +++++++++++++------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 5b11eee61a0fd..a97ab99dc2eff 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -783,6 +783,19 @@ __global__ void softmax_kernel_with_mask(T *qk_buf_, } } +#define SOFTMAX_KERNEL_WITH_MASK(REPEAT_THREAD) \ + do { \ + block.x /= REPEAT_THREAD; \ + grid.x /= 4; \ + constexpr int NUM = 4; \ + softmax_kernel_with_mask \ + <<>>(reinterpret_cast(qk_buf_), \ + (const half *)bias_qk, \ + batch_size, \ + head_num, \ + seq_len); \ + } while (0) + template inline void MatMulWithHeadQK(const phi::GPUContext &context, int head_num, @@ -843,22 +856,9 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, "QK_bias is mask can't be supported on rocm or " "cuda_arch<700")); #else - constexpr int ITEMS_PER_THREAD = 1; - bool is_half2 = true; - dim3 grid(seq_len, batch_size, head_num); dim3 block((seq_len / 2 + 31) / 32 * 32); - block.x /= ITEMS_PER_THREAD; - assert(block.x <= 1024); - assert(grid.x % 4 == 0); - grid.x /= 4; - constexpr int NUM = 4; - softmax_kernel_with_mask - <<>>(reinterpret_cast(qk_buf_), - (const half *)bias_qk, - batch_size, - head_num, - seq_len); + SOFTMAX_KERNEL_WITH_MASK(1); #endif } else { SoftmaxKernelWithEltadd2<__half2><<>>( @@ -888,13 +888,36 @@ inline void MatMulWithHeadQK(const phi::GPUContext &context, seq_len / 2, FINAL_MASK); } else { - SoftmaxKernelWithEltaddForLarge2<__half2><<>>( - reinterpret_cast<__half2 *>(qk_buf_), - reinterpret_cast(bias_qk), - batch_size, - head_num, - seq_len / 2, - FINAL_MASK); + if (bias_is_mask) { +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) + PADDLE_ENFORCE_EQ(bias_is_mask, + false, + platform::errors::InvalidArgument( + "QK_bias is mask can't be supported on rocm or " + "cuda_arch<700")); +#else + dim3 grid(seq_len, batch_size, head_num); + dim3 block((seq_len / 2 + 31) / 32 * 32); + if (block.x > 0 && block.x <= 1024) { + SOFTMAX_KERNEL_WITH_MASK(1); + } else if (block.x <= 2048) { + SOFTMAX_KERNEL_WITH_MASK(2); + } else if (block.x <= 4096) { + SOFTMAX_KERNEL_WITH_MASK(4); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot support the length of attention > 8192.")); + } +#endif + } else { + SoftmaxKernelWithEltaddForLarge2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), + batch_size, + head_num, + seq_len / 2, + FINAL_MASK); + } } } else { SoftmaxKernelWithEltaddForLarge<<>>( From 635958d9d9571bdeb7443c1aa5983655167de129 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Fri, 18 Nov 2022 11:18:31 +0800 Subject: [PATCH 078/210] optimize: vectorize transpose_padding (#48116) --- .../tensorrt/plugin/qkv_to_context_plugin.cu | 100 ++++++++++++++---- 1 file changed, 78 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 8cb8b7f4b7e20..5e3f078cf9f4d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -78,8 +78,6 @@ __global__ void transpose_qkv_padding( qkv_id * head_num * size_per_head + head_id * size_per_head; if (seq_id < real_seq_len) { dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_offset]; - } else if (seq_id < seq_len) { - dst[threadIdx.x + dst_offset] = 0; } } @@ -91,14 +89,69 @@ __global__ void transpose_qkv_unpadding(const T *src, const int head_num, const int size_per_head, const int real_seq_len) { - int batch_id = blockIdx.x / (head_num * real_seq_len); - int seq_id = blockIdx.x % real_seq_len; - int head_id = blockIdx.x % (head_num * real_seq_len) / real_seq_len; - dst[batch_id * head_num * real_seq_len * size_per_head + - seq_id * head_num * size_per_head + head_id * size_per_head + - threadIdx.x] = src[batch_id * head_num * seq_len * size_per_head + + int batch_id = blockIdx.y; + int seq_id = blockIdx.x; + int head_id = threadIdx.y; + const int src_offset = batch_id * head_num * seq_len * size_per_head + head_id * seq_len * size_per_head + - seq_id * size_per_head + threadIdx.x]; + seq_id * size_per_head; + const int dst_offset = batch_id * real_seq_len * head_num * size_per_head + + seq_id * head_num * size_per_head + + head_id * size_per_head; + + dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_offset]; +} + +#define LAUNCH_TRANSPOSE_KERNEL(TYPE, VECTOR_SIZE, PAD_TYPE) \ + do { \ + int h = head_size / VECTOR_SIZE; \ + const TYPE *input##VECTOR_SIZE = reinterpret_cast(input); \ + TYPE *output##VECTOR_SIZE = reinterpret_cast(output); \ + dim3 block(h, head_num, 1); \ + transpose_qkv_##PAD_TYPE \ + <<>>(input##VECTOR_SIZE, \ + output##VECTOR_SIZE, \ + batch, \ + seq_len, \ + head_num, \ + h, \ + real_seq_len); \ + } while (0) + +inline void TransposePadding(const half *input, + half *output, + const int batch, + const int seq_len, + const int head_num, + const int head_size, + const int real_seq_len, + cudaStream_t stream) { + const dim3 grid(seq_len, batch, 3); + if (head_size % 8 == 0) { + LAUNCH_TRANSPOSE_KERNEL(int4, 8, padding); + } else if (head_size % 2 == 0) { + LAUNCH_TRANSPOSE_KERNEL(half2, 2, padding); + } else { + LAUNCH_TRANSPOSE_KERNEL(half, 1, padding); + } +} + +inline void TransposeUnPadding(const half *input, + half *output, + const int batch, + const int seq_len, + const int head_num, + const int head_size, + const int real_seq_len, + cudaStream_t stream) { + const dim3 grid(real_seq_len, batch); + if (head_size % 8 == 0) { + LAUNCH_TRANSPOSE_KERNEL(int4, 8, unpadding); + } else if (head_size % 2 == 0) { + LAUNCH_TRANSPOSE_KERNEL(half2, 2, unpadding); + } else { + LAUNCH_TRANSPOSE_KERNEL(half, 1, unpadding); + } } int QkvToContextPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } @@ -381,15 +434,14 @@ int QkvToContextPluginDynamic::enqueue( const half *input1_data = static_cast(qk_bias); // BxSx3xNxH => tptr: 3xBxNxSxH. 
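On the LAUNCH_TRANSPOSE_KERNEL macro below: reinterpreting the half buffer as int4 moves 16 bytes (8 fp16 values) per load, half2 moves 2, and plain half 1, so the x-dimension of the block shrinks by the same factor. A small sketch of the width selection:

    def pick_vector_width(head_size):
        # mirrors the head_size % 8 / % 2 dispatch; wider loads need aligned sizes
        return 8 if head_size % 8 == 0 else 2 if head_size % 2 == 0 else 1

    for head_size in (64, 80, 26, 33):
        w = pick_vector_width(head_size)
        print(head_size, w, head_size // w)   # e.g. head_size 64 -> 8 threads, one int4 each
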
if (need_padding) { - dim3 grid_p(seq_len, batch, 3); - dim3 block_p(head_size_, head_number_, 1); - transpose_qkv_padding<<>>(input0_data, - tptr, - batch, - seq_len, - head_number_, - head_size_, - real_seq_len); + TransposePadding(input0_data, + tptr, + batch, + seq_len, + head_number_, + head_size_, + real_seq_len, + stream); } else { TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); @@ -424,10 +476,14 @@ int QkvToContextPluginDynamic::enqueue( int block = head_size_; half *output = static_cast(outputs[0]); if (need_padding) { - int grid_u = batch * head_number_ * real_seq_len; - int block_u = head_size_; - transpose_qkv_unpadding<<>>( - tptr, output, batch, seq_len, head_number_, head_size_, real_seq_len); + TransposeUnPadding(tptr, + output, + batch, + seq_len, + head_number_, + head_size_, + real_seq_len, + stream); } else { transpose<<>>( tptr, output, batch, seq_len, head_number_, head_size_); From 982d5ff77996a9c2ec8ef5f05cdb87849e03d979 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Fri, 18 Nov 2022 11:25:40 +0800 Subject: [PATCH 079/210] cast and gradient_accumulator support double for xpu, test=kunlun (#47800) --- .../fluid/imperative/gradient_accumulator.cc | 34 +++++++++++++++---- .../fluid/platform/device/xpu/xpu2_op_list.h | 2 ++ paddle/phi/kernels/xpu/cast_kernel.cc | 18 ++++++---- .../unittests/xpu/test_adadelta_op_xpu.py | 3 +- .../tests/unittests/xpu/test_cast_op_xpu.py | 11 +++++- 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 188617dd31037..b57c874ceebe0 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -31,6 +31,7 @@ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" #ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "xpu/refactor/math.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -92,13 +93,30 @@ void XPUTensorAddFunctor(const platform::Place& place, platform::DeviceContextPool::Instance().Get(place)); const XPUType* x = reinterpret_cast(src.data()); XPUType* y = reinterpret_cast(dst->mutable_data(place)); - int r = xpu::add( - ctx->x_context(), x, y, y, static_cast(src.numel())); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU add kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); + int r = -1; + int numel = static_cast(src.numel()); + if (std::is_same::value) { + xpu::ctx_guard RAII_GUARD(ctx->x_context()); + float* x_cast_to_fp32 = RAII_GUARD.alloc(numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_cast_to_fp32); + float* y_cast_to_fp32 = RAII_GUARD.alloc(numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(y_cast_to_fp32); + r = xpu::cast(ctx->x_context(), x, x_cast_to_fp32, numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + r = xpu::cast(ctx->x_context(), y, y_cast_to_fp32, numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + r = xpu::add(ctx->x_context(), + x_cast_to_fp32, + y_cast_to_fp32, + y_cast_to_fp32, + numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + r = xpu::cast(ctx->x_context(), y_cast_to_fp32, y, numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + } else { + r = xpu::add(ctx->x_context(), x, y, y, numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } } #endif @@ -286,6 +304,8 @@ void TensorAdd(const VarType& src, VarType* dst) { } else if (data_type == 
framework::DataTypeTrait::DataType()) { XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else if (data_type == framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); } else { PADDLE_THROW(platform::errors::Unimplemented( "Gradient accumulation of data type (%s) on place (%s) is not " diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index cbcbde8f9ddcd..ae6d53989c316 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -103,7 +103,9 @@ XPUOpMap& get_kl2_ops() { {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, {"check_finite_and_unscale", diff --git a/paddle/phi/kernels/xpu/cast_kernel.cc b/paddle/phi/kernels/xpu/cast_kernel.cc index 502b8324522e6..346cf4cd3bfd6 100644 --- a/paddle/phi/kernels/xpu/cast_kernel.cc +++ b/paddle/phi/kernels/xpu/cast_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" @@ -80,16 +81,19 @@ void CastKernel(const Context& dev_ctx, dev_ctx.template Alloc(out), numel); break; + case phi::DataType::FLOAT64: + r = xpu::cast_v2( + dev_ctx.x_context(), + reinterpret_cast(in_data), + dev_ctx.template Alloc(out), + numel); + break; default: PADDLE_THROW(phi::errors::Unavailable( "Not supported cast %d -> %d", x.dtype(), out_dtype)); } - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - phi::errors::External( - "XPU CAST API return wrong value[%d %s].", r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); } } // namespace phi @@ -101,6 +105,8 @@ PD_REGISTER_KERNEL(cast, float, phi::dtype::float16, int64_t, - bool) { + bool, + uint8_t, + double) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py index d65e20522a20e..4ded307cba90e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py @@ -18,7 +18,6 @@ sys.path.append("..") -from op_test import OpTest import paddle import paddle.fluid as fluid from op_test_xpu import XPUOpTest @@ -88,7 +87,7 @@ def setUp(self): def test_check_output(self): self.check_output() - class TestAdadeltaOp2(OpTest): + class TestAdadeltaOp2(XPUOpTest): '''Test Adadelta op with default attribute values''' def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py index a2e136dccaab0..a69c439c8cb6e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py @@ -36,6 +36,7 @@ 'float16': int(core.VarDesc.VarType.FP16), 'bool': int(core.VarDesc.VarType.BOOL), 'uint8': int(core.VarDesc.VarType.UINT8), + 'float64': int(core.VarDesc.VarType.FP64), } @@ -47,7 +48,15 @@ def __init__(self): def dynamic_create_class(self): base_class = self.TestCastOp classes = [] - for out_type in {'float16', 
'float32', 'int32', 'int64', 'uint8'}: + for out_type in { + 'float16', + 'float32', + 'int32', + 'int64', + 'uint8', + 'bool', + 'float64', + }: class_name = 'XPUTestCastOp_outtype_' + out_type attr_dict = {'out_typename': out_type} classes.append([class_name, attr_dict]) From a33d563c5e96624615f6c485f8b74d69a316205c Mon Sep 17 00:00:00 2001 From: parap1uie-s Date: Fri, 18 Nov 2022 11:48:55 +0800 Subject: [PATCH 080/210] Allow to specify train_bs and eval_bs separately in hapi.fit() (#48032) * Fix hAPI bug of not compatible with LayerHook https://github.com/PaddlePaddle/Paddle/issues/47000 * Fix hAPI bug of not compatible with LayerHook * Allow to specify train_bs and eval_bs separately in hapi.fit() * Update model.py * Update Model.py * Update test_model.py * update model.py --- python/paddle/hapi/model.py | 16 +++++++++++++--- python/paddle/tests/test_model.py | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index b7813932d86f6..e64aa47e2d1f7 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -1713,7 +1713,7 @@ def fit( evaluation at the end of epoch. If None, will not do evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - batch_size (int, optional): The batch size of train_data and eval_data. When + batch_size (int|list, optional): The batch size of train_data and eval_data. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1. @@ -1836,10 +1836,20 @@ def fit( """ assert train_data is not None, "train_data must be given!" + if isinstance(batch_size, (tuple, list)) and all( + [isinstance(x, int) for x in batch_size] + ): + assert ( + len(batch_size) == 2 + ), "batch_size length error, expected train_batch_size and eval_batch_size." 
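With this change callers can keep passing a single int, or pass a two-element (train, eval) pair that the assignment just below unpacks into train_batch_size and eval_batch_size for the two samplers. A minimal usage sketch, assuming the standard MNIST/LeNet helpers from paddle.vision (not part of the patch):

import paddle
from paddle.vision.datasets import MNIST
from paddle.vision.transforms import ToTensor

train_ds = MNIST(mode='train', transform=ToTensor())
eval_ds = MNIST(mode='test', transform=ToTensor())

model = paddle.Model(paddle.vision.models.LeNet())
model.prepare(paddle.optimizer.Adam(parameters=model.parameters()),
              paddle.nn.CrossEntropyLoss(),
              paddle.metric.Accuracy())

# An int keeps the old behaviour; a (train, eval) pair sets the two
# batch sizes separately, matching the validation added above.
model.fit(train_ds, eval_ds, epochs=1, batch_size=64)
model.fit(train_ds, eval_ds, epochs=1, batch_size=(64, 128))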
+ train_batch_size, eval_batch_size = batch_size + elif isinstance(batch_size, int): + train_batch_size, eval_batch_size = batch_size, batch_size + if isinstance(train_data, Dataset): train_sampler = DistributedBatchSampler( train_data, - batch_size=batch_size, + batch_size=train_batch_size, shuffle=shuffle, drop_last=drop_last, ) @@ -1855,7 +1865,7 @@ def fit( if eval_data is not None and isinstance(eval_data, Dataset): eval_sampler = DistributedBatchSampler( - eval_data, batch_size=batch_size + eval_data, batch_size=eval_batch_size ) eval_loader = DataLoader( eval_data, diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 76a41a56caf94..c20761b7cd2a4 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -312,6 +312,8 @@ def fit(self, dynamic, num_replicas=None, rank=None, num_iters=None): self.val_dataset, batch_size=64, num_iters=num_iters ) + model.fit(self.train_dataset, batch_size=(64, 64), shuffle=False) + train_sampler = DistributedBatchSampler( self.train_dataset, batch_size=64, From 9918bf9ca09bfcd358b63e37d075d1be501e3ec0 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Fri, 18 Nov 2022 12:03:38 +0800 Subject: [PATCH 081/210] [PHI decoupling] remove "gpu_primitives.h" in fluid (#48063) * remove "gpu_primitives.h" in fluid namespace * fix PR-CI-GpuPS fail * fix PR-CI-GpuPS fail --- .../framework/fleet/heter_ps/feature_value.cu | 41 +- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 4 +- .../plugin/fused_token_prune_op_plugin.cu | 4 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- paddle/fluid/operators/assign_pos_op.cu | 4 +- paddle/fluid/operators/batch_fc_op.cu | 2 +- paddle/fluid/operators/bilateral_slice_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 6 +- .../operators/collective/c_embedding_op.cu | 5 +- .../fluid/operators/collective/c_split_op.cu | 2 +- paddle/fluid/operators/conv_shift_op.cu | 2 +- paddle/fluid/operators/cvm_op.cu | 4 +- paddle/fluid/operators/data_norm_op.cu | 4 +- .../operators/deformable_psroi_pooling_op.cu | 28 +- paddle/fluid/operators/dequantize_log_op.cu | 3 +- .../fluid/operators/detection/box_clip_op.cu | 2 +- .../detection/box_decoder_and_assign_op.cu | 2 +- .../detection/collect_fpn_proposals_op.cu | 4 +- .../detection/polygon_box_transform_op.cu | 4 +- .../detection/roi_perspective_transform_op.cu | 4 +- .../detection/sigmoid_focal_loss_op.cu | 2 +- .../elementwise/elementwise_op_function.h | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 +- .../operators/fused/fused_softmax_mask.cu.h | 2 +- .../fluid/operators/gather_scatter_kernel.cu | 4 +- .../fluid/operators/graph_khop_sampler_op.cu | 5 +- paddle/fluid/operators/group_norm_op.cu | 8 +- paddle/fluid/operators/interpolate_op.cu | 101 ++- .../fluid/operators/limit_by_capacity_op.cu | 4 +- paddle/fluid/operators/lookup_table_op.cu | 4 +- paddle/fluid/operators/lookup_table_v2_op.cu | 6 +- .../fluid/operators/math/cos_sim_functor.cu | 4 +- paddle/fluid/operators/math/cross_entropy.cu | 3 +- paddle/fluid/operators/math/im2col.cu | 5 +- paddle/fluid/operators/math/maxouting.cu | 2 +- .../fluid/operators/math/sequence_pooling.cu | 2 +- paddle/fluid/operators/math/sequence_scale.cu | 4 +- paddle/fluid/operators/math/unpooling.cu | 2 +- paddle/fluid/operators/math/vol2col.cu | 2 +- paddle/fluid/operators/mean_iou_op.cu | 4 +- paddle/fluid/operators/number_count_op.cu | 4 +- paddle/fluid/operators/one_hot_op.cu | 4 +- paddle/fluid/operators/optimizers/sgd_op.cu | 4 +- paddle/fluid/operators/pad2d_op.cu | 18 +- 
paddle/fluid/operators/prroi_pool_op.h | 6 +- .../operators/prune_gate_by_capacity_op.cu | 4 +- .../operators/pull_box_extended_sparse_op.cu | 2 +- paddle/fluid/operators/pull_box_sparse_op.kps | 10 +- .../fluid/operators/pull_gpups_sparse_op.cu | 4 +- paddle/fluid/operators/quantize_linear_op.cu | 2 +- paddle/fluid/operators/random_routing_op.cu | 2 +- paddle/fluid/operators/rank_attention_op.cu | 2 +- .../sequence_ops/sequence_enumerate_op.cu | 4 +- .../sequence_ops/sequence_erase_op.cu | 4 +- .../sequence_ops/sequence_expand_as_op.cu | 2 +- .../sequence_ops/sequence_expand_op.cu | 4 +- paddle/fluid/operators/shuffle_channel_op.cu | 2 +- paddle/fluid/operators/temporal_shift_op.cu | 2 +- paddle/fluid/operators/top_k_function_cuda.h | 4 +- paddle/fluid/operators/transpose_op.cu.h | 2 +- .../platform/device/gpu/cuda_helper_test.cu | 8 +- .../platform/device/gpu/gpu_primitives.h | 622 ------------------ paddle/phi/backends/gpu/gpu_primitives.h | 2 +- 63 files changed, 192 insertions(+), 823 deletions(-) delete mode 100644 paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index f05fe6c95de0a..80a827e6ad0e8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -13,12 +13,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 @@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest, int x = low; int y = i - (x ? len[x - 1] : 0); float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT int mf_dim = gpu_dim[x] - 3; gpu_accessor.Select( dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); @@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N, return; } - float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * + float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT uint64_t(max_val_size)); switch (off) { case 0: @@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest, } int x = low; int y = i - (x ? 
len[low - 1] : 0); - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT - cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x]; + cur[gpu_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; // NOLINT int mf_dim = mf_dim_vector[x]; cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim; @@ -170,31 +171,29 @@ __global__ void PushMergeCopyAtomic(const size_t N, int y = i - slot_lens[x]; const float* ptr = src[x] + y * hidden; - float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); + float* cur = + (float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT int mf_dim = slot_dims[x] - 3; switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; - paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off)); break; case 1: - paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off)); break; case 2: - paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], + *(ptr + off) * -1. * bs); break; default: int embedx_idx = off - 3; if (mf_dim < embedx_idx) { return; } - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedxGIndex() + embedx_idx], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx], + *(ptr + off) * -1. * bs); break; } } @@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N, int i = idx / hidden; int off = idx % hidden; // filter 0 keys - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT if (total_keys[i] == 0) { switch (off) { @@ -262,7 +261,7 @@ __global__ void PushMergeCopy(const size_t N, switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; SUM_GRAD_VALUE cur[accessor.ShowIndex()] = val; @@ -331,8 +330,8 @@ void AccessorWrapper::CopyForPushImpl( const uint64_t total_length, const int batch_size, size_t grad_value_size, - std::vector& slot_vector, - std::vector& slot_mf_dim_vector) { + std::vector& slot_vector, // NOLINT + std::vector& slot_mf_dim_vector) { // NOLINT auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 7f27b6889fc98..169b87b2b4017 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -22,12 +22,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index e49bf16bf6878..fe011422c19e9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -20,8 +20,8 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" #include "paddle/fluid/operators/fused_token_prune_op.cu.h" @@ -149,7 +149,7 @@ __global__ void ReduceSum2( } if (tid == 0) { - platform::fastAtomicAdd( + phi::fastAtomicAdd( reinterpret_cast(dst), static_cast(batch * max_seq_len + col), static_cast(bsz * max_seq_len), diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index cb7e7a8d12812..16c297459ce04 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index f5704b6a08617..0f1107765d384 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -23,8 +23,8 @@ We retain the following license from the original files: #include "paddle/fluid/operators/assign_pos_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" DECLARE_bool(avoid_op_randomness); @@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count, CUDA_KERNEL_LOOP(i, limit) { int number_idx = numbers[i]; if (number_idx > -1) { - int p = platform::CudaAtomicAdd(cum_count + number_idx, -1); + int p = phi::CudaAtomicAdd(cum_count + number_idx, -1); out[p - 1] = i; } } diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b8641565729a3..178e57d7a261a 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 1e0d0da5dbbad..c995c3ed091dd 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/bilateral_slice_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index fed463d8f7cd7..44495ddf32eb3 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include "paddle/fluid/operators/center_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void ComputeDifferent(T *centers_diff, @@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers, const T *diff = centers_diff + idy * D; T *cent = centers + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); + phi::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 53aef8e835734..e1fa8795d420e 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table, auto id = ids[row]; if (id >= start_idx && id < end_idx) { auto real_idx = id - start_idx; - paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], - output[i]); + phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]); } } } diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 2089c23fa6ec5..3539a7304010e 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 689722d24eccb..047ef75d1fb39 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 3db8c125ec173..e8fdcec36082a 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,12 +16,12 @@ limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index b040b5dfd8d61..790e55965a9d2 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -29,7 +29,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; using DataLayout = phi::DataLayout; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 002f89b162082..f1816850317a1 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -41,7 +41,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; static inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( // compute gradient of input if (bottom_data_diff) { - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x0, - q00 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x0, - q01 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x1, - q10 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x1, - q11 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0, + q00 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0, + q01 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1, + q10 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1, + q11 * diff_val); } // compute gradient of trans @@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( u00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); } } } diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 4a1976f6fdd68..18719c76b2ef0 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/dequantize_log_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index f7239b406b8fd..8fc8ec221f3e8 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 6acc3845d2408..daae995de0d5e 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index f244bdca35eb7..18e52957d1acb 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { @@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids, int* length_lod) { CUDA_KERNEL_LOOP(i, nthreads) { - platform::CudaAtomicAdd(length_lod + batch_ids[i], 1); + phi::CudaAtomicAdd(length_lod + batch_ids[i], 1); } } diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 49e3d3d96ba5d..bbeb9f7f2858a 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { using Tensor = phi::DenseTensor; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 template diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index cd298e50cad69..0c339b5f219f6 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,12 +15,12 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 76a47581e9f72..56d28c20dc8e7 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index ecdec98339b42..7bcd336732960 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index a5f9f03493706..b6dd3ca8f64b2 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 60723c6cb5d17..12e511fe3aef9 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index 80dbce4b24d28..2f17b946c6149 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -35,7 +35,7 @@ class ReduceAdd { typename tensor_t, std::enable_if_t::value>* = nullptr> __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { - platform::CudaAtomicAdd(self_data, *src_data); + phi::CudaAtomicAdd(self_data, *src_data); } template ::value>* = nullptr> diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index c83419f309237..2e703282bf932 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -41,8 +41,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_khop_sampler_imp.h" #include "paddle/fluid/operators/graph_khop_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" constexpr int WARP_SIZE = 32; @@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, const int num = curand(&rng) % (idx + 1); #endif if (num < k) { - paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num, - idx); + phi::CudaAtomicMax(output_idxs + out_row_start + num, idx); } } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 08ea4d3278b23..d0a2935197a8c 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { typedef cub::WarpReduce WarpReduce; typename WarpReduce::TempStorage temp_storage; value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); + if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); } template @@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 0e6755dd0b7ba..a589b49500e0a 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in, in_img_idx * num_channels + channel_id]; } const T out_pos = out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } @@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in, const T* out_pos = &out[out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]); } } } @@ 
-408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], h1lambda * w1lambda * out_pos[0]); } @@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); } else { int in_pos1_idx = out_id_h * input_w + in_img_idt * in_img_h * in_img_w * num_channels + @@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], - d2lambda 
* h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], d1lambda * h1lambda * w1lambda * out_pos[0]); } @@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in, in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + access_x * num_channels + channel_id]; } - platform::CudaAtomicAdd(&in_pos[0], - (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + phi::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); } } } diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 4ca7a03b489be..f6e0bffa1d1ce 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/limit_by_capacity_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl( wid = i / n_expert; eid = i % n_expert; auto proposal = expc[wid * n_expert + eid]; - auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1)); + auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1)); if (cap_left >= proposal) { out[wid * n_expert + eid] = proposal; } else if (cap_left >= 0) { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 073077f6586fa..0562228f516fa 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table, const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 41be6b34e6e5b..a3d8c91d86265 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table, const T *out = output + idy * D; T *tab = table + id * D; #ifdef PADDLE_WITH_CUDA - paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); + phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); #else for (int i = idx; i < D; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } #endif idy += blockDim.y * gridDim.x; diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index cbe76844519a1..bb04df0879bf6 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm, for (size_t i = 0; i < cols; ++i) { T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - z_data * y[i] * reciprocal_y_norm_square); - platform::CudaAtomicAdd(dy + i, dy_data); + phi::CudaAtomicAdd(dy + i, dy_data); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 478c4e0cd6611..8282f2b8a24f2 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -15,10 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math.h" - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 843e50c50a697..5c7038714e93c 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data, if (height_offset >= 0 && height_offset < im_height && width_offset >= 0 && width_offset < im_width) { - paddle::platform::CudaAtomicAdd(im_data + im_offset, - col_data[col_offset]); + phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]); } } } diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index df115fd16966d..9f1d2286395a4 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index eadf0d070b901..530b68bbfbb3c 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index a0bb2a1ac33ce..21010ca33148e 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { namespace math { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void SequenceScaleKernel(T* seq, diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index e3d7abb6e0d71..0ecac6c5fb07a 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 765f31eba34f0..999e29470ebbd 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 3e7f8a5363ac0..e73496a46a0ad 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,13 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void CountCUDAKernel(const int num_classes, diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index dcbf95d059185..25541ebdb3621 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/number_count_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers, #endif } if (threadIdx.x % WARP_SIZE == 0) { - platform::CudaAtomicAdd(number_count + i, x); + phi::CudaAtomicAdd(number_count + i, x); } } } diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 1a2939366f391..b36ca97b3e40f 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/one_hot_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillOutputKernel(const InT* p_in_data, diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 05b00bac890a7..385e9a70e5489 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( tensor_out_ptr + index, -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index c76a6b61e780e..7b0dd2149dead 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -16,13 +16,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad2DConstNCHW(const int nthreads, @@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, 2 * in_height - in_h - 2); in_w = min(in_w, 2 * in_width - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, in_height * 2 - in_h - 2); in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } @@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, nc /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, n /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index d4375b2fc48ce..07a2bde7e94e4 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif namespace paddle { @@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T coeff) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); if (!overflow) { - paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); + phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); } } #else @@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, #if defined(__NVCC__) || defined(__HIPCC__) template DEVICE void AccumulateRois(T* offset, T data) { - paddle::platform::CudaAtomicAdd(offset, data); + phi::CudaAtomicAdd(offset, data); } #else template diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index cf0763be27d5b..9f038002cfbe6 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -21,7 +21,7 @@ // Licensed under the Apache License, Version 2.0 (the "License"). 
#include "paddle/fluid/operators/prune_gate_by_capacity_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data, const int64_t batch_size) { CUDA_KERNEL_LOOP(i, batch_size) { auto orig_cap = - platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); + phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); if (orig_cap <= 0) { new_gate_idx_data[i] = -1; } else { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 26a02ea622479..cfa317a3d392f 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/pull_box_extended_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps index 6b7c7c8495108..4b0580c5e1ab5 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.kps +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -37,7 +37,7 @@ limitations under the License. */ #include "xpu/kernel/math.h" // NOLINT #else #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif #include "paddle/fluid/operators/pull_box_sparse_op.h" @@ -46,9 +46,13 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; #ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(pull_box_sparse, + KP, + plat::XPUPlace, ops::PullBoxSparseKernel); -REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(push_box_sparse, + KP, + plat::XPUPlace, ops::PushBoxSparseKernel); #else REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu index 996eacf428979..d22c632d60dd2 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cu +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index c5d8b1928fd78..259c1507af038 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/quantize_linear_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 287a523f61f3b..1fdb1bf73a304 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 36117e605031e..8107e520b0492 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6a1afcc18e68c..0f53f292ef8ae 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -16,11 +16,11 @@ #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 74789ecde9d38..d8b0afbc85dc5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index d5beedd35338a..f565e0d438a0e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 2c9b6408a7390..e4ebd47878cb2 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_x = threadIdx.x; tid_x < x_item_length; tid_x += blockDim.x) { - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &dx_data[(x_offset + tid_y) * x_item_length + tid_x], dout_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length + diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 26eee095377c0..4869a4c6c5e22 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cad6f416d41d8..d2583aeb143ec 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -11,7 +11,7 @@ #include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index faf2b08089157..e95bca3c2791e 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #define FINAL_MASK 0xffffffff #ifdef __HIPCC__ @@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, if (GetLaneId() == 0) { #pragma unroll for (uint32_t i = 0; i < RadixSize; ++i) { - platform::CudaAtomicAdd(&shared_mem[i], counts[i]); + phi::CudaAtomicAdd(&shared_mem[i], counts[i]); } } diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index eb9e8a7bed784..4fc610c393f10 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index 68229bba74ccd..a3fff0dbed8e2 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -24,17 +24,15 @@ #define PADDLE_CUDA_FP16 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AddKernel(const T* data_a, T* data_b, size_t num) { - CUDA_KERNEL_LOOP(i, num) { - paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); - } + CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); } } template diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h deleted file mode 100644 index 4df203b48bb9a..0000000000000 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ /dev/null @@ -1,622 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif -#ifdef PADDLE_WITH_HIP -#include -#endif -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace platform { - -#define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) - -#define USE_CUDA_ATOMIC(op, T) \ - CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } - -// Default thread count per block(or block size). -// TODO(typhoonzero): need to benchmark against setting this value -// to 1024. -constexpr int PADDLE_CUDA_NUM_THREADS = 512; - -// For atomicAdd. -USE_CUDA_ATOMIC(Add, float); -USE_CUDA_ATOMIC(Add, int); -USE_CUDA_ATOMIC(Add, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT - -CUDA_ATOMIC_WRAPPER(Add, int64_t) { - // Here, we check long long int must be int64_t. - static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - return CudaAtomicAdd( - reinterpret_cast(address), // NOLINT - static_cast(val)); // NOLINT -} - -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) -USE_CUDA_ATOMIC(Add, double); -#else -CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int *address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -#ifdef PADDLE_CUDA_FP16 -// NOTE(dzhwinter): cuda do not have atomicCAS for half. -// Just use the half address as a unsigned value address and -// do the atomicCAS. According to the value store at high 16 bits -// or low 16 bits, then do a different sum and CAS. -// Given most warp-threads will failed on the atomicCAS, so this -// implemented should be avoided in high concurrency. It's will be -// slower than the way convert value into 32bits and do a full atomicCAS. - -// convert the value into float and do the add arithmetic. -// then store the result into a uint32. 
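A worked illustration of that packing, with concrete values that are illustrative only (the add_to_low_half / add_to_high_half helpers defined next implement exactly this):

// Suppose the aligned 32-bit word containing the target float16 currently holds
//   val = 0xABCD3C00   // high half 0xABCD belongs to the neighbouring element,
//                      // low half 0x3C00 is the target value (1.0 as float16)
// and the thread wants to add x = 1.0f to the target:
//   low_half.x = val & 0xFFFFu                   -> 0x3C00   (1.0 as float16)
//   low_half   = float16(float(low_half) + x)    -> 0x4000   (2.0 as float16)
//   return (val & 0xFFFF0000u) | low_half.x      -> 0xABCD4000
// Only the low 16 bits change; the atomicCAS loop below then publishes the new
// word atomically and retries if the neighbouring half was modified concurrently.
// add_to_high_half is the mirror image for elements stored in the upper 16 bits.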
-inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { - float16 low_half; - // the float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { - float16 high_half; - // the float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { - return *reinterpret_cast<__half *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, float16) { - return CUDAFP16ToPDFP16( - atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, float16) { - // concrete packed float16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -template -struct VecAtomicAddHelperBase { - static constexpr auto kIsAvailable = IsAvailable; - using NVT = NVType; - using NVVec2T = NVVec2Type; -}; - -template -struct VecAtomicAddHelper : VecAtomicAddHelperBase {}; - -#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -template <> -struct VecAtomicAddHelper - : VecAtomicAddHelperBase {}; -#endif - -#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -template <> -struct VecAtomicAddHelper - : VecAtomicAddHelperBase {}; -#endif - -// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" -// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. -template ::kIsAvailable>::type * = - nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *tensor, - size_t index, - const size_t numel, - T value) { - // whether the address is 32-byte aligned. 
- using NVT = typename VecAtomicAddHelper::NVT; - using NVVec2T = typename VecAtomicAddHelper::NVVec2T; - NVT *target_addr = reinterpret_cast(tensor + index); - bool aligned_half2 = - (reinterpret_cast(target_addr) % sizeof(NVVec2T) == 0); - - if (aligned_half2 && index < (numel - 1)) { - NVVec2T value2; - value2.x = *reinterpret_cast(&value); - value2.y = 0.0; - atomicAdd(reinterpret_cast(target_addr), value2); - - } else if (!aligned_half2 && index > 0) { - NVVec2T value2; - value2.x = 0.0; - value2.y = *reinterpret_cast(&value); - atomicAdd(reinterpret_cast(target_addr - 1), value2); - - } else { - atomicAdd(reinterpret_cast(tensor) + index, - *reinterpret_cast(&value)); - } -} - -template ::kIsAvailable>::type - * = nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *arr, - size_t index, - const size_t numel, - T value) { - CudaAtomicAdd(arr + index, value); -} -#endif - -// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. -inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { - bfloat16 low_half; - // the bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { - bfloat16 high_half; - // the bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { - return *reinterpret_cast<__nv_bfloat16 *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), - PDBF16ToCUDABF16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - // concrete packed bfloat16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -CUDA_ATOMIC_WRAPPER(Add, complex) { - float *real = reinterpret_cast(address); - float *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -CUDA_ATOMIC_WRAPPER(Add, complex) { - double *real = reinterpret_cast(address); - double *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -// For atomicMax -USE_CUDA_ATOMIC(Max, int); -USE_CUDA_ATOMIC(Max, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. 
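The fastAtomicAdd overloads removed above take (pointer, element index, element count, value) so that, for float16 and bfloat16 on architectures where packed atomics exist, the helper can place the value in the correct lane of an aligned half2/bfloat162 and issue a single vector atomicAdd, falling back to the scalar CudaAtomicAdd otherwise. A hedged usage sketch, assuming the definitions above are still in scope; the scatter-add kernel and its buffers are hypothetical, not taken from the patch:

template <typename T>
__global__ void ScatterAddKernel(const int64_t* index, const T* src, T* dst,
                                 int64_t slice_len, int64_t dst_numel) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < slice_len) {
    // The (index, numel) pair lets the helper check whether the neighbouring
    // lane it would pack into actually exists inside the destination tensor.
    fastAtomicAdd(dst, static_cast<size_t>(index[0] * slice_len + i),
                  static_cast<size_t>(dst_numel), src[i]);
  }
}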
-// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT - if (*address >= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed >= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Max, int64_t) { - // Here, we check long long int must be int64_t. - static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val > res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Max, float) { - if (*address >= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Max, double) { - if (*address >= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(max(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(max(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Max, float16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -// For atomicMin -USE_CUDA_ATOMIC(Min, int); -USE_CUDA_ATOMIC(Min, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. 
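The float and double Max wrappers above emulate an atomic maximum with a compare-and-swap loop: the comparison is always performed on the floating-point value (via __int_as_float / __longlong_as_double), the integer reinterpretation is only the CAS payload, and there is an early return when the stored value already dominates. A hedged usage sketch, assuming these wrappers are in scope; the kernel and buffers are hypothetical:

__global__ void RowMaxKernel(const float* x, float* row_max, int rows, int cols) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < rows * cols) {
    // row_max must be pre-initialised (e.g. to -FLT_MAX) before the launch.
    CudaAtomicMax(&row_max[i / cols], x[i]);
  }
}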
-// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT - if (*address <= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed <= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Min, int64_t) { - // Here, we check long long int must be int64_t. - static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val < res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Min, float) { - if (*address <= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Min, double) { - if (*address <= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(min(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(min(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Min, float16) { - if (*address <= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -#ifdef PADDLE_WITH_CUDA -/* - * One thead block deals with elementwise atomicAdd for vector of len. - * @in: [x1, x2, x3, ...] - * @out:[y1+x1, y2+x2, y3+x3, ...] 
- * */ - -template ::kIsAvailable>::type - * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -} - -// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. -template ::kIsAvailable>::type * = - nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { - int i = 0; - int loops = len / 2 * 2; - - using NVT = typename VecAtomicAddHelper::NVT; - using NVVec2T = typename VecAtomicAddHelper::NVVec2T; - bool aligned_half2 = - (reinterpret_cast(out) % sizeof(NVT) == 0); - - if (aligned_half2) { - for (i = tid * 2; i < loops; i += threads_per_block * 2) { - NVVec2T value2; - T value_1 = in[i]; - T value_2 = in[i + 1]; - value2.x = *reinterpret_cast(&value_1); - value2.y = *reinterpret_cast(&value_2); - atomicAdd(reinterpret_cast(&out[i]), value2); - } - for (; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } else { - for (int i = tid; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } -} - -#endif -} // namespace platform -} // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index 12f58257cf044..67b34aa289f73 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
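For reference, a hedged sketch of how the block-level VectorizedAtomicAddPerBlock helper deleted above was typically driven, assuming its definition (and fastAtomicAdd) is still in scope; the kernel name and data layout are hypothetical, not taken from the patch:

template <typename T>
__global__ void BlockwiseAccumulate(const T* in, T* out, int64_t len) {
  // One thread block per input vector: block b adds in[b*len .. b*len+len)
  // element-wise into the shared output vector `out`.  For float16 the even
  // prefix is handled with packed half2 atomics and the tail (plus any
  // unaligned case) falls back to fastAtomicAdd; len is assumed even, as the
  // original comment notes.
  VectorizedAtomicAddPerBlock(len, threadIdx.x, blockDim.x,
                              in + blockIdx.x * len, out);
}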
From a7d306afe64c29e8df5cb613cf197d123f312d8b Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Fri, 18 Nov 2022 12:09:35 +0800 Subject: [PATCH 082/210] add bf16 for numel (#48121) --- paddle/phi/kernels/cpu/size_kernel.cc | 1 + paddle/phi/kernels/gpu/size_kernel.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc index 4019976ecec9c..a070c2de5bc44 100644 --- a/paddle/phi/kernels/cpu/size_kernel.cc +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -27,6 +27,7 @@ PD_REGISTER_KERNEL(size, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, float, double, bool) {} diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu index fb6acd5599a8e..a165e6c82df96 100644 --- a/paddle/phi/kernels/gpu/size_kernel.cu +++ b/paddle/phi/kernels/gpu/size_kernel.cu @@ -26,6 +26,7 @@ PD_REGISTER_KERNEL(size, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, float, double, bool) {} From 593bc4e2c360faecf95161d6852eff050eab5c2c Mon Sep 17 00:00:00 2001 From: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Date: Fri, 18 Nov 2022 12:20:05 +0800 Subject: [PATCH 083/210] remove no used fluid beam_search_decoder (#48096) --- python/paddle/fluid/contrib/__init__.py | 5 +- .../paddle/fluid/contrib/decoder/__init__.py | 18 - .../contrib/decoder/beam_search_decoder.py | 896 ------------------ python/paddle/fluid/tests/CMakeLists.txt | 2 - .../fluid/tests/test_beam_search_decoder.py | 292 ------ python/setup.py.in | 1 - 6 files changed, 2 insertions(+), 1212 deletions(-) delete mode 100644 python/paddle/fluid/contrib/decoder/__init__.py delete mode 100644 python/paddle/fluid/contrib/decoder/beam_search_decoder.py delete mode 100644 python/paddle/fluid/tests/test_beam_search_decoder.py diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index a3f0aac77fff7..2860d414d0a5b 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import decoder -from .decoder import * + from . import memory_usage_calc from .memory_usage_calc import * from . import op_frequence @@ -36,7 +35,7 @@ from .sparsity import * __all__ = [] -__all__ += decoder.__all__ + __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py deleted file mode 100644 index 6343c1543d206..0000000000000 --- a/python/paddle/fluid/contrib/decoder/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . 
import beam_search_decoder -from .beam_search_decoder import * - -__all__ = beam_search_decoder.__all__ diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py deleted file mode 100644 index 717d31c2fe1b9..0000000000000 --- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py +++ /dev/null @@ -1,896 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This module provides a general beam search decoder API for RNN based decoders. -The purpose of this API is to allow users to highly customize the behavior -within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.), -without using the low level API such as while ops. - -This API is still under active development and may change drastically. -""" - -from ...wrapped_decorator import signature_safe_contextmanager -import numpy as np - -from ... import layers -from ...framework import Variable -from ... import core -from ... import framework, unique_name -from ...layer_helper import LayerHelper - -__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder'] - - -class _DecoderType: - TRAINING = 1 - BEAM_SEARCH = 2 - - -class InitState: - """ - The initial hidden state object. The state objects holds a variable, and may - use it to initialize the hidden state cell of RNN. Usually used as input to - `StateCell` class. - - Args: - init (Variable): The initial variable of the hidden state. If set None, - the variable will be created as a tensor with constant value based - on `shape` and `value` param. - shape (tuple|list): If `init` is None, new Variable's shape. Default - None. - value (float): If `init` is None, new Variable's value. Default None. - init_boot (Variable): If provided, the initial variable will be created - with the same shape as this variable. - need_reorder (bool): If set true, the init will be sorted by its lod - rank within its batches. This should be used if `batch_size > 1`. - dtype (np.dtype|core.VarDesc.VarType|str): Data type of the initial - variable. - - Returns: - An initialized state object. - - Examples: - See `StateCell`. 
- """ - - def __init__( - self, - init=None, - shape=None, - value=0.0, - init_boot=None, - need_reorder=False, - dtype='float32', - ): - if init is not None: - self._init = init - elif init_boot is None: - raise ValueError( - 'init_boot must be provided to infer the shape of InitState .\n' - ) - else: - self._init = layers.fill_constant_batch_size_like( - input=init_boot, value=value, shape=shape, dtype=dtype - ) - - self._shape = shape - self._value = value - self._need_reorder = need_reorder - self._dtype = dtype - - @property - def value(self): - return self._init - - @property - def need_reorder(self): - return self._need_reorder - - -class _MemoryState: - def __init__(self, state_name, rnn_obj, init_state): - self._state_name = state_name # each is a rnn.memory - self._rnn_obj = rnn_obj - self._state_mem = self._rnn_obj.memory( - init=init_state.value, need_reorder=init_state.need_reorder - ) - - def get_state(self): - return self._state_mem - - def update_state(self, state): - self._rnn_obj.update_memory(self._state_mem, state) - - -class _ArrayState: - def __init__(self, state_name, block, init_state): - self._state_name = state_name - self._block = block - - self._state_array = self._block.create_var( - name=unique_name.generate('array_state_array'), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=init_state.value.dtype, - ) - - self._counter = self._block.create_var( - name=unique_name.generate('array_state_counter'), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype='int64', - ) - - # initialize counter - self._block.append_op( - type='fill_constant', - inputs={}, - outputs={'Out': [self._counter]}, - attrs={ - 'shape': [1], - 'dtype': self._counter.dtype, - 'value': float(0.0), - 'force_cpu': True, - }, - ) - - self._counter.stop_gradient = True - - # write initial state - block.append_op( - type='write_to_array', - inputs={'X': init_state.value, 'I': self._counter}, - outputs={'Out': self._state_array}, - ) - - def get_state(self): - state = layers.array_read(array=self._state_array, i=self._counter) - return state - - def update_state(self, state): - layers.increment(x=self._counter, value=1, in_place=True) - layers.array_write(state, array=self._state_array, i=self._counter) - - -class StateCell: - """ - The state cell class stores the hidden state of the RNN cell. A typical RNN - cell has one or more hidden states, and one or more step inputs. This class - allows you to defines the name of hidden states as well as step inputs, and - their associated variables. - - Args: - inputs (dict): A feeding dict of {name(str) : Variable}. It specifies - the names of step inputs for RNN cell, and the associated variables. - The variable could initially be None and set manually during each - RNN step. - states (dict): A feeding dict of {name(str) : InitState object}. It - specifies the names of hidden states and their initialized state. - out_state (str): A string that specifies the name of hidden state that - will be used to compute the score in beam search process. - name (str): The name of the RNN cell. Default None. - - Raises: - `ValueError`: If the initial state is not an instance of InitState, or - the out_state is not in the dict of states. - - Returns: - StateCell: The initialized StateCell object. - - Examples: - .. 
code-block:: python - hidden_state = InitState(init=encoder_out, need_reorder=True) - state_cell = StateCell( - inputs={'current_word': None}, - states={'h': hidden_state}, - out_state='h') - """ - - def __init__(self, inputs, states, out_state, name=None): - self._helper = LayerHelper('state_cell', name=name) - self._cur_states = {} - self._state_names = [] - for state_name, state in states.items(): - if not isinstance(state, InitState): - raise ValueError('state must be an InitState object.') - self._cur_states[state_name] = state - self._state_names.append(state_name) - self._inputs = inputs # inputs is place holder here - self._cur_decoder_obj = None - self._in_decoder = False - self._states_holder = {} - self._switched_decoder = False - self._state_updater = None - self._out_state = out_state - if self._out_state not in self._cur_states: - raise ValueError('out_state must be one state in states') - - def _enter_decoder(self, decoder_obj): - if self._in_decoder == True or self._cur_decoder_obj is not None: - raise ValueError('StateCell has already entered a decoder.') - self._in_decoder = True - self._cur_decoder_obj = decoder_obj - self._switched_decoder = False - - def _leave_decoder(self, decoder_obj): - if not self._in_decoder: - raise ValueError( - 'StateCell not in decoder, ' 'invalid leaving operation.' - ) - - if self._cur_decoder_obj != decoder_obj: - raise ValueError('Inconsistent decoder object in StateCell.') - - self._in_decoder = False - self._cur_decoder_obj = None - self._switched_decoder = False - - def _switch_decoder(self): # lazy switch - if not self._in_decoder: - raise ValueError('StateCell must be enter a decoder.') - - if self._switched_decoder: - raise ValueError('StateCell already done switching.') - - for state_name in self._state_names: - if state_name not in self._states_holder: - state = self._cur_states[state_name] - - if not isinstance(state, InitState): - raise ValueError( - 'Current type of state is %s, should be ' - 'an InitState object.' % type(state) - ) - - self._states_holder[state_name] = {} - - if self._cur_decoder_obj.type == _DecoderType.TRAINING: - self._states_holder[state_name][ - id(self._cur_decoder_obj) - ] = _MemoryState( - state_name, self._cur_decoder_obj.dynamic_rnn, state - ) - elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH: - self._states_holder[state_name][ - id(self._cur_decoder_obj) - ] = _ArrayState( - state_name, self._cur_decoder_obj._parent_block(), state - ) - else: - raise ValueError( - 'Unknown decoder type, only support ' - '[TRAINING, BEAM_SEARCH]' - ) - - # Read back, since current state should be LoDTensor - self._cur_states[state_name] = self._states_holder[state_name][ - id(self._cur_decoder_obj) - ].get_state() - - self._switched_decoder = True - - def get_state(self, state_name): - """ - The getter of state object. Find the state variable by its name. - - Args: - state_name (str): A string of the state's name. - - Returns: - The associated state object. - """ - if self._in_decoder and not self._switched_decoder: - self._switch_decoder() - - if state_name not in self._cur_states: - raise ValueError( - 'Unknown state %s. Please make sure _switch_decoder() ' - 'invoked.' % state_name - ) - - return self._cur_states[state_name] - - def get_input(self, input_name): - """ - The getter of input variable. Find the input variable by its name. - - Args: - input_name (str): The string of the input's name. - - Returns: - The associated input variable. 
- """ - if input_name not in self._inputs or self._inputs[input_name] is None: - raise ValueError('Invalid input %s.' % input_name) - return self._inputs[input_name] - - def set_state(self, state_name, state_value): - """ - The setter of the state variable. Change the variable of the given - `state_name`. - - Args: - state_name (str): The name of the state to change. - state_value (Var): The variable of the new state. - """ - self._cur_states[state_name] = state_value - - def state_updater(self, updater): - """ - Set up the updater to update the hidden state every RNN step. The - behavior of updater could be customized by users. The updater should be - a function that takes a `StateCell` object as input and update the - hidden state within it. The hidden state could be accessed through - `get_state` method. - - Args: - updater (func): the updater to update the state cell. - """ - self._state_updater = updater - - def _decorator(state_cell): - if state_cell == self: - raise TypeError( - 'Updater should only accept a StateCell object ' - 'as argument.' - ) - updater(state_cell) - - return _decorator - - def compute_state(self, inputs): - """ - Provide the step input of RNN cell, and compute the new hidden state - with updater and give step input. - - Args: - inputs (dict): A feed dict, {name(str): Variable}. name should be - the names of step inputs for this RNN cell, and Variable should be - the associated variables. - - Examples: - .. code-block:: python - state_cell.compute_state(inputs={'x': current_word}) - """ - if self._in_decoder and not self._switched_decoder: - self._switch_decoder() - - for input_name, input_value in inputs.items(): - if input_name not in self._inputs: - raise ValueError( - 'Unknown input %s. ' - 'Please make sure %s in input ' - 'place holder.' % (input_name, input_name) - ) - self._inputs[input_name] = input_value - self._state_updater(self) - - def update_states(self): - """ - Update and record state information after each RNN step. - """ - if self._in_decoder and not self._switched_decoder: - self._switched_decoder() - - for state_name, decoder_state in self._states_holder.items(): - if id(self._cur_decoder_obj) not in decoder_state: - raise ValueError( - 'Unknown decoder object, please make sure ' - 'switch_decoder been invoked.' - ) - decoder_state[id(self._cur_decoder_obj)].update_state( - self._cur_states[state_name] - ) - - def out_state(self): - """ - Get the output state variable. This must be called after update_states. - - Returns: - The output variable of the RNN cell. - """ - return self._cur_states[self._out_state] - - -class TrainingDecoder: - """ - A decoder that can only be used for training. The decoder could be - initialized with a `StateCell` object. The computation within the RNN cell - could be defined with decoder's block. - - Args: - state_cell (StateCell): A StateCell object that handles the input and - state variables. - name (str): The name of this decoder. Default None. - - Returns: - TrainingDecoder: The initialized TrainingDecoder object. - - Examples: - .. 
code-block:: python - decoder = TrainingDecoder(state_cell) - with decoder.block(): - current_word = decoder.step_input(trg_embedding) - decoder.state_cell.compute_state(inputs={'x': current_word}) - current_score = layers.fc(input=decoder.state_cell.get_state('h'), - size=32, - act='softmax') - decoder.state_cell.update_states() - decoder.output(current_score) - """ - - BEFORE_DECODER = 0 - IN_DECODER = 1 - AFTER_DECODER = 2 - - def __init__(self, state_cell, name=None): - self._helper = LayerHelper('training_decoder', name=name) - self._status = TrainingDecoder.BEFORE_DECODER - self._dynamic_rnn = layers.DynamicRNN() - self._type = _DecoderType.TRAINING - self._state_cell = state_cell - self._state_cell._enter_decoder(self) - - @signature_safe_contextmanager - def block(self): - """ - Define the behavior of the decoder for each RNN time step. - """ - if self._status != TrainingDecoder.BEFORE_DECODER: - raise ValueError('decoder.block() can only be invoked once') - self._status = TrainingDecoder.IN_DECODER - - with self._dynamic_rnn.block(): - yield - - self._status = TrainingDecoder.AFTER_DECODER - self._state_cell._leave_decoder(self) - - @property - def state_cell(self): - self._assert_in_decoder_block('state_cell') - return self._state_cell - - @property - def dynamic_rnn(self): - return self._dynamic_rnn - - @property - def type(self): - return self._type - - def step_input(self, x): - """ - Set the input variable as a step input to the RNN cell. For example, - in machine translation, each time step we read one word from the target - sentences, then the target sentence is a step input to the RNN cell. - - Args: - x (Variable): the variable to be used as step input. - - Returns: - Variable: The variable as input of current step. - - Examples: - .. code-block:: python - current_word = decoder.step_input(trg_embedding) - """ - self._assert_in_decoder_block('step_input') - return self._dynamic_rnn.step_input(x) - - def static_input(self, x): - """ - Set the input variable as a static input of RNN cell. In contrast to - step input, this variable will be used as a whole within the RNN decode - loop and will not be scattered into time steps. - - Args: - x (Variable): the variable to be used as static input. - - Returns: - Variable: The variable as input of current step. - - Examples: - .. code-block:: python - encoder_vec = decoder.static_input(encoded_vector) - """ - self._assert_in_decoder_block('static_input') - return self._dynamic_rnn.static_input(x) - - def __call__(self, *args, **kwargs): - """ - Get the output of RNN. This API should only be invoked after RNN.block() - - Returns: - Variable: The specified output of the RNN cell. - """ - if self._status != TrainingDecoder.AFTER_DECODER: - raise ValueError( - 'Output of training decoder can only be visited ' - 'outside the block.' - ) - return self._dynamic_rnn(*args, **kwargs) - - def output(self, *outputs): - """ - Set the output variable of the RNN cell. - - Args: - *outputs (Variables): a series of variables that treated as output - of the RNN cell. - - Examples: - .. code-block:: python - out = fluid.layers.fc(input=h, - size=32, - bias_attr=True, - act='softmax') - decoder.output(out) - """ - self._assert_in_decoder_block('output') - self._dynamic_rnn.output(*outputs) - - def _assert_in_decoder_block(self, method): - if self._status != TrainingDecoder.IN_DECODER: - raise ValueError( - '%s should be invoked inside block of ' - 'TrainingDecoder object.' 
% method - ) - - -class BeamSearchDecoder: - """ - A beam search decoder that can be used for inference. The decoder should be - initialized with a `StateCell` object. The decode process can be defined - within its block. - - Args: - state_cell (StateCell): A StateCell object that handles the input and - state variables. - init_ids (Variable): The init beam search token ids. - init_scores (Variable): The associated score of each id. - target_dict_dim (int): Size of dictionary. - word_dim (int): Word embedding dimension. - input_var_dict (dict): A feeding dict to feed the required input - variables to the state cell. It will be used by state_cell 's - compute method. Default empty. - topk_size (int): The topk size used for beam search. Default 50. - max_len (int): The maximum allowed length of the generated sentence. - Default 100. - beam_size (int): The beam width of beam search decode. Default 1. - end_id (int): The id of end token within beam search. - name (str): The name of this decoder. Default None. - - Returns: - BeamSearchDecoder: A initialized BeamSearchDecoder object. - - Examples: - .. code-block:: python - decoder = BeamSearchDecoder( - state_cell=state_cell, - init_ids=init_ids, - init_scores=init_scores, - target_dict_dim=target_dict_dim, - word_dim=word_dim, - init_var_dict={}, - topk_size=topk_size, - sparse_emb=IS_SPARSE, - max_len=max_length, - beam_size=beam_size, - end_id=1, - name=None - ) - decoder.decode() - translation_ids, translation_scores = decoder() - """ - - BEFORE_BEAM_SEARCH_DECODER = 0 - IN_BEAM_SEARCH_DECODER = 1 - AFTER_BEAM_SEARCH_DECODER = 2 - - def __init__( - self, - state_cell, - init_ids, - init_scores, - target_dict_dim, - word_dim, - input_var_dict={}, - topk_size=50, - sparse_emb=True, - max_len=100, - beam_size=1, - end_id=1, - name=None, - ): - self._helper = LayerHelper('beam_search_decoder', name=name) - self._counter = layers.zeros(shape=[1], dtype='int64') - self._counter.stop_gradient = True - self._type = _DecoderType.BEAM_SEARCH - self._max_len = layers.fill_constant( - shape=[1], dtype='int64', value=max_len - ) - self._cond = layers.less_than( - x=self._counter, - y=layers.fill_constant(shape=[1], dtype='int64', value=max_len), - ) - self._while_op = layers.While(self._cond) - self._state_cell = state_cell - self._state_cell._enter_decoder(self) - self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER - self._zero_idx = layers.fill_constant( - shape=[1], value=0, dtype='int64', force_cpu=True - ) - self._array_dict = {} - self._array_link = [] - self._ids_array = None - self._scores_array = None - self._beam_size = beam_size - self._end_id = end_id - - self._init_ids = init_ids - self._init_scores = init_scores - self._target_dict_dim = target_dict_dim - self._topk_size = topk_size - self._sparse_emb = sparse_emb - self._word_dim = word_dim - self._input_var_dict = input_var_dict - - @signature_safe_contextmanager - def block(self): - """ - Define the behavior of the decoder for each RNN time step. 
- """ - if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER: - raise ValueError('block() can only be invoke once.') - - self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER - - with self._while_op.block(): - yield - with layers.Switch() as switch: - with switch.case(self._cond): - layers.increment(x=self._counter, value=1.0, in_place=True) - - for value, array in self._array_link: - layers.array_write( - x=value, i=self._counter, array=array - ) - - layers.less_than( - x=self._counter, y=self._max_len, cond=self._cond - ) - - self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER - self._state_cell._leave_decoder(self) - - @property - def type(self): - return self._type - - def early_stop(self): - """ - Stop the generation process in advance. Could be used as "break". - """ - layers.fill_constant( - shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond - ) - - def decode(self): - """ - Set up the computation within the decoder. Then you could call the - decoder to get the result of beam search decode. If you want to define - a more specific decoder, you could override this function. - - Examples: - .. code-block:: python - decoder.decode() - translation_ids, translation_scores = decoder() - """ - with self.block(): - prev_ids = self.read_array(init=self._init_ids, is_ids=True) - prev_scores = self.read_array( - init=self._init_scores, is_scores=True - ) - prev_ids_embedding = layers.embedding( - input=prev_ids, - size=[self._target_dict_dim, self._word_dim], - dtype='float32', - is_sparse=self._sparse_emb, - ) - - feed_dict = {} - update_dict = {} - - for init_var_name, init_var in self._input_var_dict.items(): - if init_var_name not in self.state_cell._inputs: - raise ValueError( - 'Variable ' - + init_var_name - + ' not found in StateCell!\n' - ) - - read_var = self.read_array(init=init_var) - update_dict[init_var_name] = read_var - feed_var_expanded = layers.sequence_expand( - read_var, prev_scores - ) - feed_dict[init_var_name] = feed_var_expanded - - for state_str in self._state_cell._state_names: - prev_state = self.state_cell.get_state(state_str) - prev_state_expanded = layers.sequence_expand( - prev_state, prev_scores - ) - self.state_cell.set_state(state_str, prev_state_expanded) - - for i, input_name in enumerate(self._state_cell._inputs): - if input_name not in feed_dict: - feed_dict[input_name] = prev_ids_embedding - - self.state_cell.compute_state(inputs=feed_dict) - current_state = self.state_cell.out_state() - current_state_with_lod = layers.lod_reset( - x=current_state, y=prev_scores - ) - scores = layers.fc( - input=current_state_with_lod, - size=self._target_dict_dim, - act='softmax', - ) - topk_scores, topk_indices = layers.topk(scores, k=self._topk_size) - accu_scores = layers.elementwise_add( - x=layers.log(x=topk_scores), - y=layers.reshape(prev_scores, shape=[-1]), - axis=0, - ) - selected_ids, selected_scores = layers.beam_search( - prev_ids, - prev_scores, - topk_indices, - accu_scores, - self._beam_size, - end_id=1, - level=0, - ) - - with layers.Switch() as switch: - with switch.case(layers.is_empty(selected_ids)): - self.early_stop() - with switch.default(): - self.state_cell.update_states() - self.update_array(prev_ids, selected_ids) - self.update_array(prev_scores, selected_scores) - for update_name, var_to_update in update_dict.items(): - self.update_array(var_to_update, feed_dict[update_name]) - - def read_array(self, init, is_ids=False, is_scores=False): - """ - Read an array to get the decoded ids and scores generated by 
previous - RNN step. At the first step of RNN, the init variable mut be used to - initialize the array. - - Args: - init (Variable): The initial variable for first step usage. init - must be provided. - is_ids (bool): Specify whether the variable is an id. - is_scores (bool): Specify whether the variable is a score. - - Returns: - The associated variable generated during previous RNN steps. - - Examples: - .. code-block:: python - prev_ids = decoder.read_array(init=init_ids, is_ids=True) - prev_scores = decoder.read_array(init=init_scores, is_scores=True) - """ - self._assert_in_decoder_block('read_array') - - if is_ids and is_scores: - raise ValueError( - 'Shouldn\'t mark current array be ids array and' - 'scores array at the same time.' - ) - - if not isinstance(init, Variable): - raise TypeError('The input argument `init` must be a Variable.') - - parent_block = self._parent_block() - array = parent_block.create_var( - name=unique_name.generate('beam_search_decoder_array'), - type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=init.dtype, - ) - parent_block.append_op( - type='write_to_array', - inputs={'X': init, 'I': self._zero_idx}, - outputs={'Out': array}, - ) - - if is_ids: - self._ids_array = array - elif is_scores: - self._scores_array = array - - read_value = layers.array_read(array=array, i=self._counter) - self._array_dict[read_value.name] = array - return read_value - - def update_array(self, array, value): - """ - Store the value generated in current step in an array for each RNN step. - This array could be accessed by read_array method. - - Args: - array (Variable): The array to append the new variable to. - value (Variable): The newly generated value to be stored. - """ - self._assert_in_decoder_block('update_array') - - if not isinstance(array, Variable): - raise TypeError( - 'The input argument `array` of must be a Variable.' - ) - if not isinstance(value, Variable): - raise TypeError('The input argument `value` of must be a Variable.') - - array = self._array_dict.get(array.name, None) - if array is None: - raise ValueError('Please invoke read_array before update_array.') - self._array_link.append((value, array)) - - def __call__(self): - """ - Run the decode process and return the final decode result. - - Returns: - A tuple of decoded (id, score) pairs. id is a Variable that holds - the generated tokens, and score is a Variable with the same shape - as id, holds the score for each generated token. - """ - if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER: - raise ValueError( - 'Output of BeamSearchDecoder object can ' - 'only be visited outside the block.' - ) - return layers.beam_search_decode( - ids=self._ids_array, - scores=self._scores_array, - beam_size=self._beam_size, - end_id=self._end_id, - ) - - @property - def state_cell(self): - self._assert_in_decoder_block('state_cell') - return self._state_cell - - def _parent_block(self): - """ - Getter of parent block. - - Returns: - The parent block of decoder. - """ - program = self._helper.main_program - parent_block_idx = program.current_block().parent_idx - if parent_block_idx < 0: - raise ValueError('Invalid block with index %d.' % parent_block_idx) - parent_block = program.block(parent_block_idx) - return parent_block - - def _assert_in_decoder_block(self, method): - if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER: - raise ValueError( - '%s should be invoked inside block of ' - 'BeamSearchDecoder object.' 
% method - ) diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 92e29202b28b8..fbf928ddf2b68 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -13,5 +13,3 @@ add_subdirectory(book) add_subdirectory(custom_op) add_subdirectory(custom_kernel) add_subdirectory(custom_runtime) - -set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py deleted file mode 100644 index 3a5c8604648bc..0000000000000 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -A simple machine translation demo using beam search decoder. -""" - -import contextlib -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.framework as framework -import paddle.fluid.layers as layers -from paddle.fluid.executor import Executor -from paddle.fluid.contrib.decoder.beam_search_decoder import ( - BeamSearchDecoder, - InitState, - StateCell, - TrainingDecoder, -) -import unittest - -paddle.enable_static() - -dict_size = 30000 -source_dict_dim = target_dict_dim = dict_size -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -hidden_dim = 32 -word_dim = 32 -decoder_size = hidden_dim -IS_SPARSE = True -batch_size = 2 -max_length = 8 -topk_size = 50 -trg_dic_size = 10000 -beam_size = 2 - - -def encoder(): - # encoder - src_word = layers.data( - name="src_word", shape=[1], dtype='int64', lod_level=1 - ) - src_embedding = layers.embedding( - input=src_word, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - ) - - fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_last_step(input=lstm_hidden0) - return encoder_out - - -def decoder_state_cell(context): - h = InitState(init=context, need_reorder=True) - state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h') - - @state_cell.state_updater - def updater(state_cell): - current_word = state_cell.get_input('x') - prev_h = state_cell.get_state('h') - # make sure lod of h heritted from prev_h - h = layers.fc( - input=[prev_h, current_word], size=decoder_size, act='tanh' - ) - state_cell.set_state('h', h) - - return state_cell - - -def decoder_train(state_cell): - # decoder - trg_language_word = layers.data( - name="target_word", shape=[1], dtype='int64', lod_level=1 - ) - trg_embedding = layers.embedding( - input=trg_language_word, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - ) - - decoder = TrainingDecoder(state_cell) - - with decoder.block(): - current_word = decoder.step_input(trg_embedding) - decoder.state_cell.compute_state(inputs={'x': current_word}) - current_score = 
layers.fc( - input=decoder.state_cell.get_state('h'), - size=target_dict_dim, - act='softmax', - ) - decoder.state_cell.update_states() - decoder.output(current_score) - - return decoder() - - -def decoder_decode(state_cell): - init_ids = layers.data( - name="init_ids", shape=[1], dtype="int64", lod_level=2 - ) - init_scores = layers.data( - name="init_scores", shape=[1], dtype="float32", lod_level=2 - ) - - decoder = BeamSearchDecoder( - state_cell=state_cell, - init_ids=init_ids, - init_scores=init_scores, - target_dict_dim=target_dict_dim, - word_dim=word_dim, - input_var_dict={}, - topk_size=topk_size, - sparse_emb=IS_SPARSE, - max_len=max_length, - beam_size=beam_size, - end_id=1, - name=None, - ) - decoder.decode() - translation_ids, translation_scores = decoder() - - return translation_ids, translation_scores - - -def train_main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - context = encoder() - state_cell = decoder_state_cell(context) - rnn_out = decoder_train(state_cell) - label = layers.data( - name="target_next_word", shape=[1], dtype='int64', lod_level=1 - ) - cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = paddle.mean(x=cost) - - optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3) - optimizer.minimize(avg_cost) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000 - ), - batch_size=batch_size, - ) - feed_order = ['src_word', 'target_word', 'target_next_word'] - - exe = Executor(place) - - def train_loop(main_program): - exe.run(framework.default_startup_program()) - - feed_list = [ - main_program.global_block().var(var_name) for var_name in feed_order - ] - feeder = fluid.DataFeeder(feed_list, place) - - for pass_id in range(1): - for batch_id, data in enumerate(train_reader()): - outs = exe.run( - main_program, feed=feeder.feed(data), fetch_list=[avg_cost] - ) - avg_cost_val = np.array(outs[0]) - print( - 'pass_id=' - + str(pass_id) - + ' batch=' - + str(batch_id) - + " avg_cost=" - + str(avg_cost_val) - ) - if batch_id > 3: - break - - train_loop(framework.default_main_program()) - - -def decode_main(use_cuda): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - context = encoder() - state_cell = decoder_state_cell(context) - translation_ids, translation_scores = decoder_decode(state_cell) - - exe = Executor(place) - exe.run(framework.default_startup_program()) - - init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') - init_scores_data = np.array( - [1.0 for _ in range(batch_size)], dtype='float32' - ) - init_ids_data = init_ids_data.reshape((batch_size, 1)) - init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] - - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) - - train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(dict_size), buf_size=1000 - ), - batch_size=batch_size, - ) - - feed_order = ['src_word'] - feed_list = [ - framework.default_main_program().global_block().var(var_name) - for var_name in feed_order - ] - feeder = fluid.DataFeeder(feed_list, place) - - data = next(train_reader()) - feed_dict = feeder.feed([[x[0]] for x in data]) - feed_dict['init_ids'] = init_ids - feed_dict['init_scores'] 
= init_scores - - result_ids, result_scores = exe.run( - framework.default_main_program(), - feed=feed_dict, - fetch_list=[translation_ids, translation_scores], - return_numpy=False, - ) - print(result_ids.lod()) - - -class TestBeamSearchDecoder(unittest.TestCase): - pass - - -@contextlib.contextmanager -def scope_prog_guard(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield - - -def inject_test_train(use_cuda): - f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu') - - def f(*args): - with scope_prog_guard(): - train_main(use_cuda) - - setattr(TestBeamSearchDecoder, f_name, f) - - -def inject_test_decode(use_cuda, decorator=None): - f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu') - - def f(*args): - with scope_prog_guard(): - decode_main(use_cuda) - - if decorator is not None: - f = decorator(f) - - setattr(TestBeamSearchDecoder, f_name, f) - - -for _use_cuda_ in (False, True): - inject_test_train(_use_cuda_) - -for _use_cuda_ in (False, True): - _decorator_ = None - inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_) - -if __name__ == '__main__': - unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 08fcca5b2d8f4..79237c0c238c0 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -340,7 +340,6 @@ packages=['paddle', 'paddle.fluid.layers', 'paddle.fluid.dataloader', 'paddle.fluid.contrib', - 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.quantization', From 14a6e67b60f65539a38deb6c6a2984955d1b4d13 Mon Sep 17 00:00:00 2001 From: Tian Zheng Date: Fri, 18 Nov 2022 13:58:40 +0800 Subject: [PATCH 084/210] CUDNN v8 Implementation of Convolution Kernels (#47454) * Refactor conv_kernel and conv_grad_kernel to provide interface for CUDNNv8 implementation * Fix macro * Add implementation for conv_kernel and conv_grad_kernel * Modification after rebase onto latest develop * Modify plan cache to comply with the API of phi::autotune * Refactor to reduce duplicate code * Review fix: - move functions in conv_kernel_impl_v8.h and conv_grad_kernel_impl_v8.h to conv_kernel.cu and conv_grad_kernelk.cu - add const specifier for input tensor - add logging when plans fail to execute - move CudnnConvBwdFilterV8 and CudnnConvBwdDataV8 to conv_cudnn_frontend.h * - move plan building outside of cache * Fix ROCM build --- .../platform/device/gpu/cuda/cudnn_desc.h | 7 +- paddle/fluid/platform/flags.cc | 13 + paddle/phi/kernels/CMakeLists.txt | 4 +- paddle/phi/kernels/autotune/CMakeLists.txt | 9 +- paddle/phi/kernels/autotune/cache.cc | 25 + paddle/phi/kernels/autotune/cache.h | 42 +- .../kernels/autotune/cache_cudnn_frontend.h | 132 ++++ .../phi/kernels/gpudnn/conv_cudnn_frontend.h | 464 ++++++++++++ paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 25 - paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 25 + paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 685 +++++++++++------- paddle/phi/kernels/gpudnn/conv_kernel.cu | 498 +++++++++---- 12 files changed, 1473 insertions(+), 456 deletions(-) create mode 100644 paddle/phi/kernels/autotune/cache_cudnn_frontend.h create mode 100644 paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index 4c949c66d1bc2..de8c30efe5a4e 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h +++ 
b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -40,12 +40,13 @@ inline cudnnDataType_t ToCudnnDataType(const T& t) { return ToCudnnDataType(type); } -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); +template +inline std::vector TransformDimOrder(const std::vector& dims) { + std::vector transformed_dims(dims.begin(), dims.end()); if (dims.size() < 4) { return transformed_dims; } - int H, W, D, C; + T H, W, D, C; if (dims.size() == 4) { H = dims[1]; W = dims[2]; diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index d2d2089cee69d..b2ade3d455b6c 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1053,4 +1053,17 @@ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, * Note: Enable CUDNNv8 Frontend API for CUDNN kernels. */ PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); + +/** + * CUDNNv8 related FLAG + * Name: cudnn_cache_saturation_count + * Since Version: 2.5.0 + * Value Range: int64_t, default=1 + * Example: + * Note: Set saturation count for CUDNNv8 cache. A candidate execution + * plan need to be considered as the fastest plan by exhaustive search + * N times before it is actually added in the cache. It is useful when + * the result of exhaustive search is unstable. + */ +PADDLE_DEFINE_EXPORTED_int32(cudnn_cache_saturation_count, 1, ""); #endif // PADDLE_WITH_CUDNN_FRONTEND diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 8e45da27a806a..75659d2bcd81a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -84,7 +84,9 @@ if(WITH_NCCL OR WITH_RCCL) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_nccl) endif() set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup_comm_utils) - +if(WITH_CUDNN_FRONTEND) + set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cudnn-frontend) +endif() copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index 33c10e7f48b2a..5b195ef3ecf1e 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -1,4 +1,11 @@ -cc_library(cache SRCS cache.cc) +if(WITH_CUDNN_FRONTEND) + cc_library( + cache + SRCS cache.cc + DEPS cudnn-frontend) +else() + cc_library(cache SRCS cache.cc) +endif() cc_library( switch_autotune SRCS switch_autotune.cc diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index d72790f634128..99f09ac2788ac 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -38,6 +38,17 @@ std::string AlgorithmTypeString(int64_t algo_type) { static_cast(AlgorithmType::kConvBackwardFilter)) { return "conv_backward_filter"; } +#ifdef PADDLE_WITH_CUDNN_FRONTEND + if (algo_type == static_cast(AlgorithmType::kConvForwardV8)) { + return "conv_forward_v8"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardDataV8)) { + return "conv_backward_data_v8"; + } else if (algo_type == + static_cast(AlgorithmType::kConvBackwardFilterV8)) { + return "conv_backward_filter_v8"; + } +#endif return std::to_string(algo_type); } @@ -71,6 +82,20 @@ void AutoTuneCache::UpdateStatus() { cache_misses += v.second.CacheMisses(); } +#ifdef PADDLE_WITH_CUDNN_FRONTEND + for (auto& v : cudnn_v8_auto_tune_map_) { + VLOG(4) << "AlgoType: " << 
std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } +#endif + total_size_ = size; total_cache_hits_ = cache_hits; total_cache_misses_ = cache_misses; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 027ce58fd2cd4..5766ae8e8a441 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -19,7 +19,9 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/autotune/cache_base.h" - +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#include "paddle/phi/kernels/autotune/cache_cudnn_frontend.h" +#endif namespace phi { namespace autotune { @@ -41,8 +43,16 @@ enum class AlgorithmType { kConvForward = 1, kConvBackwardData = 2, kConvBackwardFilter = 3, +#ifdef PADDLE_WITH_CUDNN_FRONTEND + kConvForwardV8 = 4, + kConvBackwardDataV8 = 5, + kConvBackwardFilterV8 = 6, + kTranspose = 7, + kAlgorithmCount = 8 +#else kTranspose = 4, kAlgorithmCount = 5 +#endif }; // AlgorithmsConfigKey -> AlgorithmsID @@ -53,7 +63,10 @@ using AlgorithmsTypeMap = std::unordered_map; using ConvAlgorithmsCacheMap = ConvAlgorithmsCache; using ConvAlgorithmsTypeMap = std::unordered_map; - +#ifdef PADDLE_WITH_CUDNN_FRONTEND +using CudnnV8AlgorithmsTypeMap = + std::unordered_map; +#endif class AutoTuneCache { public: static AutoTuneCache& Instance() { @@ -69,6 +82,12 @@ class AutoTuneCache { return conv_auto_tune_map_[static_cast(algo_type)]; } +#ifdef PADDLE_WITH_CUDNN_FRONTEND + CudnnFrontendPlanCache& GetConvV8(const AlgorithmType& algo_type) { + return cudnn_v8_auto_tune_map_[static_cast(algo_type)]; + } +#endif + AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } void Clean() { @@ -79,6 +98,12 @@ class AutoTuneCache { for (auto& v : conv_auto_tune_map_) { v.second.Clean(); } + +#ifdef PADDLE_WITH_CUDNN_FRONTEND + for (auto& v : cudnn_v8_auto_tune_map_) { + v.second.Clean(); + } +#endif } void UpdateStatus(); @@ -117,6 +142,16 @@ class AutoTuneCache { ConvAlgorithmsCacheMap cache; conv_auto_tune_map_[key] = cache; } +#ifdef PADDLE_WITH_CUDNN_FRONTEND + } else if (algo_type == AlgorithmType::kConvForwardV8 || + algo_type == AlgorithmType::kConvBackwardDataV8 || + algo_type == AlgorithmType::kConvBackwardFilterV8) { + int64_t key = static_cast(algo_type); + if (cudnn_v8_auto_tune_map_.find(key) == cudnn_v8_auto_tune_map_.end()) { + CudnnFrontendPlanCache cache; + cudnn_v8_auto_tune_map_[key] = cache; + } +#endif } else { int64_t key = static_cast(algo_type); if (auto_tune_map_.find(key) == auto_tune_map_.end()) { @@ -128,6 +163,9 @@ class AutoTuneCache { AlgorithmsTypeMap auto_tune_map_; ConvAlgorithmsTypeMap conv_auto_tune_map_; +#ifdef PADDLE_WITH_CUDNN_FRONTEND + CudnnV8AlgorithmsTypeMap cudnn_v8_auto_tune_map_; +#endif std::shared_ptr autotune_cache_mutex_; int64_t total_cache_hits_{0}; int64_t total_cache_misses_{0}; diff --git a/paddle/phi/kernels/autotune/cache_cudnn_frontend.h b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h new file mode 100644 index 0000000000000..c65e69bdbef4f --- /dev/null +++ b/paddle/phi/kernels/autotune/cache_cudnn_frontend.h @@ -0,0 +1,132 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/phi/backends/dynload/cudnn_frontend.h" + +DECLARE_int32(cudnn_cache_saturation_count); + +namespace phi { +namespace autotune { + +class CudnnFrontendPlanCache { + public: + CudnnFrontendPlanCache() : cache_mutex_(new std::mutex()) { + map_.clear(); + tracker_.clear(); + saturation_count_ = FLAGS_cudnn_cache_saturation_count; + } + + int64_t Size() const { return map_.size(); } + + int64_t CacheHits() const { return cache_hits_; } + + int64_t CacheMisses() const { return cache_misses_; } + + float CacheHitRate() const { + int64_t num_accesses = cache_hits_ + cache_misses_; + float cache_hit_rate = 0.; + if (num_accesses != 0) { + cache_hit_rate = + static_cast(cache_hits_) / static_cast(num_accesses); + } + return cache_hit_rate; + } + + void Clean() { + std::lock_guard lock(*cache_mutex_); + map_.clear(); + tracker_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + + bool FindPlan(const cudnn_frontend::OperationGraph& op_graph, + bool use_addto = false) { + bool ret = false; + std::lock_guard lock(*cache_mutex_); + if (map_.count(MakeKey(op_graph, use_addto)) > 0) { + cache_hits_++; + ret = true; + } else { + cache_misses_++; + } + return ret; + } + + cudnn_frontend::ManagedOpaqueDescriptor GetConfig( + const cudnn_frontend::OperationGraph& op_graph, + cudnnHandle_t handle, + bool use_addto = false) { + std::lock_guard lock(*cache_mutex_); + auto engine_config = map_[MakeKey(op_graph, use_addto)]; + return engine_config; + } + + void InsertPlan(const cudnn_frontend::OperationGraph& op_graph, + const cudnn_frontend::ExecutionPlan& plan, + bool use_addto = false) { + VLOG(4) << "[cudnn_frontend] cache: Insert graph tag: " + << op_graph.getTag(); + std::lock_guard lock(*cache_mutex_); + map_.insert( + std::make_pair(MakeKey(op_graph, use_addto), plan.GetEngineConfig())); + } + + bool IsStable(const cudnn_frontend::OperationGraph& op_graph, + const std::string& tag, + bool use_addto = false) { + if (saturation_count_ == 1) { + return true; + } + std::lock_guard lock(*cache_mutex_); + if (map_.count(MakeKey(op_graph, use_addto))) { + return false; + } + int cnt = tracker_[std::make_pair(MakeKey(op_graph, use_addto), tag)] += 1; + VLOG(4) << "[cudnn_frontend] SaturationTracker: " << op_graph.getTag() + << " " << tag << " " << cnt; + return cnt >= saturation_count_; + } + + private: + static cudnn_frontend::feature_vector_t MakeKey( + const cudnn_frontend::OperationGraph& op_graph, bool use_addto) { + auto key = op_graph.getFeatureVector(); + key.push_back(static_cast(use_addto)); + return key; + } + + std::map + map_; + std::shared_ptr cache_mutex_; + int saturation_count_; + + using SaturationTracker = + std::map, int>; + SaturationTracker tracker_; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; // class CudnnFrontendPlanCache + +} // namespace autotune +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h 
b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h new file mode 100644 index 0000000000000..e45ff63d41b3a --- /dev/null +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h @@ -0,0 +1,464 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h" +#include "paddle/phi/backends/dynload/cudnn_frontend.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" + +namespace phi { + +class CudnnFrontendConvHelper { + public: + static bool IsNonDeterministic(cudnnBackendDescriptor_t engine_config) { + return cudnn_frontend::hasNumericalNote< + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(engine_config); + } + static bool AllowAll(cudnnBackendDescriptor_t engine_config) { + (void)engine_config; + return false; + } + + static uint8_t GetAlignment(const phi::DenseTensor* tensor) { + // alignment are in bytes + uint8_t alignment = 1; + uint64_t address = reinterpret_cast(tensor->data()); + while (address % alignment == 0 && alignment < 16) alignment *= 2; + return alignment; + } + + static std::vector GetInt64Array(const std::vector& in_array) { + std::vector out_array(in_array.size()); + for (int i = 0; i < in_array.size(); i++) { + out_array[i] = static_cast(in_array[i]); + } + return out_array; + } + + static std::vector GenerateStrides( + const std::vector& dim, cudnnTensorFormat_t filter_format) { + // ref: + // https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/helpers.cpp + // For INT8x4 and INT8x32 we still compute standard strides here to input + // into the cuDNN functions. We will manually scale by resizeFactor in the + // cpu ref. 
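+    // A worked example of the strides this helper produces, using an
+    // arbitrary shape dim = {N, C, H, W} = {2, 3, 4, 5} (logical NCHW order):
+    //   CUDNN_TENSOR_NCHW -> stride = {60, 20, 5, 1}   (W is innermost)
+    //   CUDNN_TENSOR_NHWC -> stride = {60, 1, 15, 3}   (C is innermost)
+    // The helper never reorders dim itself; it only decides which axis gets
+    // stride 1 and how the remaining strides are accumulated.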
+ size_t nb_dims = dim.size(); + std::vector stride(nb_dims); + if (filter_format == CUDNN_TENSOR_NCHW) { + stride[nb_dims - 1] = 1; + for (int64_t d = nb_dims - 2; d >= 0; d--) { + stride[d] = stride[d + 1] * dim[d + 1]; + } + } else { + // Here we assume that the format is CUDNN_TENSOR_NHWC + stride[1] = 1; + stride[nb_dims - 1] = stride[1] * dim[1]; + for (int64_t d = nb_dims - 2; d >= 2; d--) { + stride[d] = stride[d + 1] * dim[d + 1]; + } + stride[0] = stride[2] * dim[2]; + } + return stride; + } + + static cudnn_frontend::Tensor GetTensorDescriptor( + const phi::DenseTensor* tensor, + int64_t id, + cudnnTensorFormat_t layout_format) { + auto transformed_dims = phi::vectorize(tensor->dims()); + if (layout_format == CUDNN_TENSOR_NHWC) { + transformed_dims = paddle::platform::TransformDimOrder(transformed_dims); + } + std::vector strides = + GenerateStrides(transformed_dims, layout_format); + return cudnn_frontend::TensorBuilder() + .setDim(transformed_dims.size(), transformed_dims.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(GetAlignment(tensor)) + .setDataType(paddle::platform::ToCudnnDataType( + paddle::framework::TransToProtoVarType(tensor->dtype()))) + .build(); + } + + static cudnn_frontend::ConvDesc_v8 GetConvDescriptor( + cudnnDataType_t dataType, + const std::vector& padding, + const std::vector& stride, + const std::vector& dilation) { + uint64_t conv_dim = stride.size(); + cudnnDataType_t compute_type = + (dataType == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; + std::vector padding_int64 = GetInt64Array(padding); + std::vector stride_int64 = GetInt64Array(stride); + std::vector dilation_int64 = GetInt64Array(dilation); + return cudnn_frontend::ConvDescBuilder() + .setDataType(compute_type) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setNDims(conv_dim) + .setStrides(conv_dim, stride_int64.data()) + .setPrePadding(conv_dim, padding_int64.data()) + .setPostPadding(conv_dim, padding_int64.data()) + .setDilation(conv_dim, dilation_int64.data()) + .build(); + } + + template + static cudnn_frontend::OperationGraph BuildConvOperationGraph( + const phi::DenseTensor* x_tensor, + const phi::DenseTensor* y_tensor, + const phi::DenseTensor* w_tensor, + cudnnTensorFormat_t layout_format, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + cudnnDataType_t dtype, + cudnnHandle_t handle, + float alpha, + float beta) { + auto op = cudnn_frontend::OperationBuilder(op_mode) + .setxDesc(GetTensorDescriptor(x_tensor, 'x', layout_format)) + .setyDesc(GetTensorDescriptor(y_tensor, 'y', layout_format)) + .setwDesc(GetTensorDescriptor(w_tensor, 'w', layout_format)) + .setcDesc(GetConvDescriptor( + dtype, padding_common, strides, dilations)) + .setAlpha(alpha) + .setBeta(beta) + .build(); + std::array ops = {&op}; + return cudnn_frontend::OperationGraphBuilder() + .setHandle(handle) + .setOperationGraph(1, ops.data()) + .build(); + } + + static cudnn_frontend::executionPlans_t FindExecutionPlans( + cudnn_frontend::OperationGraph* op_graph_pointer, + bool exhaustive_search, + bool deterministic, + void* x_data, + void* y_data, + void* w_data, + cudnnHandle_t handle, + phi::DnnWorkspaceHandle* workspace_handle) { + auto heurgen_method = [=](cudnn_frontend::OperationGraph& op_graph_) + -> cudnn_frontend::EngineConfigList { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(op_graph_) + .setHeurMode(CUDNN_HEUR_MODE_INSTANT) + .build(); + VLOG(4) << "Heuristic has " << 
heuristics.getEngineConfigCount() + << " configurations "; + + auto& engine_configs = + heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_frontend::filter(engine_configs, + filtered_configs, + deterministic ? IsNonDeterministic : AllowAll); + return filtered_configs; + }; + + auto fallback_method = [=](cudnn_frontend::OperationGraph& op_graph_) + -> cudnn_frontend::EngineConfigList { + auto fallback = cudnn_frontend::EngineFallbackListBuilder() + .setOperationGraph(op_graph_) + .build(); + auto& fallback_list = fallback.getFallbackList(); + cudnn_frontend::EngineConfigList filtered_configs; + cudnn_frontend::filter(fallback_list, + filtered_configs, + deterministic ? IsNonDeterministic : AllowAll); + return filtered_configs; + }; + + std::array sources = { + heurgen_method, fallback_method}; + cudnn_frontend::EngineConfigGenerator generator(sources.size(), + sources.data()); + + size_t workspace_size_limit = + CalcWorkspaceLimitInBytes(UseFixedWorkspace()); + auto predicate_function = + [=](cudnn_frontend::ExecutionPlan const& plan) -> bool { + return plan.getWorkspaceSize() > workspace_size_limit; + }; + + auto plans = + generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function); + + bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + + if (!deterministic && (exhaustive_search || use_autotune)) { + size_t workspace_size_max = 0; + std::for_each( + plans.begin(), plans.end(), [&](cudnn_frontend::ExecutionPlan& opt) { + if (opt.getWorkspaceSize() > workspace_size_max) { + workspace_size_max = opt.getWorkspaceSize(); + } + }); + VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_max; + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {x_data, y_data, w_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + plans = + generator + .cudnnFindPlan( + handle, + *op_graph_pointer, + variant_pack, + predicate_function); + }, + workspace_size_max); + } + + std::for_each( + plans.begin(), plans.end(), [](cudnn_frontend::ExecutionPlan& opt) { + VLOG(6) << "Plan tag: " << opt.getTag() << " finished in " + << opt.getExecutionTime() << " ms," + << " workspace: " << opt.getWorkspaceSize() << " bytes"; + }); + + return plans; + } +}; // class CudnnFrontendConvHelper + +template +void CudnnConvBwdDataV8(const DenseTensor* dy_tensor, + const DenseTensor* w_tensor, + cudnnHandle_t handle, + DnnWorkspaceHandle* workspace_handle, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + cudnnDataType_t dtype, + cudnnTensorFormat_t layout_format, + bool use_addto, + bool exhaustive_search, + bool deterministic, + DenseTensor* dx_tensor) { + auto& plan_cache_bwd_data = + phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kConvBackwardDataV8); + T* dy_tensor_data = const_cast(dy_tensor->data()); + T* w_tensor_data = const_cast(w_tensor->data()); + T* dx_tensor_data = dx_tensor->data(); + + float alpha = 1.0f; + float beta = use_addto ? 
1.0f : 0.0f; + + using helper = CudnnFrontendConvHelper; + auto op_graph = helper::BuildConvOperationGraph< + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR>( + dx_tensor, + dy_tensor, + w_tensor, + layout_format, + strides, + padding_common, + dilations, + dtype, + handle, + alpha, + beta); + + if (plan_cache_bwd_data.FindPlan(op_graph, use_addto)) { + auto engine_config = + plan_cache_bwd_data.GetConfig(op_graph, handle, use_addto); + auto cached_plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(engine_config, op_graph.getTag()) + .build(); + auto workspace_size = cached_plan.getWorkspaceSize(); + VLOG(4) << "Cached execution plan found." << cached_plan.getTag() + << "; Require workspace: " << workspace_size; + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + return; + } + + auto plans = helper::FindExecutionPlans(&op_graph, + exhaustive_search, + deterministic, + dx_tensor_data, + dy_tensor_data, + w_tensor_data, + handle, + workspace_handle); + + for (auto& plan : plans) { + try { + int64_t workspace_size = plan.getWorkspaceSize(); + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + if (!exhaustive_search || + plan_cache_bwd_data.IsStable(op_graph, plan.getTag(), use_addto)) { + plan_cache_bwd_data.InsertPlan(op_graph, plan, use_addto); + } + return; + } catch (cudnn_frontend::cudnnException& e) { + } catch (phi::enforce::EnforceNotMet& e) { + } + } + PADDLE_THROW( + phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could " + "be found to execute conv backward data.")); +} + +template +void CudnnConvBwdFilterV8(const DenseTensor* x_tensor, + const DenseTensor* dy_tensor, + cudnnHandle_t handle, + DnnWorkspaceHandle* workspace_handle, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + cudnnDataType_t dtype, + cudnnTensorFormat_t layout_format, + bool use_addto, + bool exhaustive_search, + bool deterministic, + DenseTensor* dw_tensor) { + auto& plan_cache_bwd_filter = + phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kConvBackwardFilterV8); + T* x_tensor_data = const_cast(x_tensor->data()); + T* dy_tensor_data = const_cast(dy_tensor->data()); + T* dw_tensor_data = dw_tensor->data(); + + float alpha = 1.0f; + float beta = 0.0f; + + using helper = CudnnFrontendConvHelper; + auto op_graph = helper::BuildConvOperationGraph< + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR>( + x_tensor, + dy_tensor, + dw_tensor, + layout_format, + strides, + padding_common, + dilations, + dtype, + handle, + alpha, + beta); + + if (plan_cache_bwd_filter.FindPlan(op_graph)) { + auto engine_config = 
plan_cache_bwd_filter.GetConfig(op_graph, handle); + auto cached_plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(engine_config, op_graph.getTag()) + .build(); + auto workspace_size = cached_plan.getWorkspaceSize(); + VLOG(4) << "Cached execution plan found." << cached_plan.getTag() + << "; Require workspace: " << workspace_size; + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + return; + } + + auto plans = helper::FindExecutionPlans(&op_graph, + exhaustive_search, + deterministic, + x_tensor_data, + dy_tensor_data, + dw_tensor_data, + handle, + workspace_handle); + + for (auto& plan : plans) { + try { + int64_t workspace_size = plan.getWorkspaceSize(); + workspace_handle->RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + if (!exhaustive_search || + plan_cache_bwd_filter.IsStable(op_graph, plan.getTag())) { + plan_cache_bwd_filter.InsertPlan(op_graph, plan); + } + return; + } catch (cudnn_frontend::cudnnException& e) { + VLOG(4) << "Plan " << plan.describe() + << "failed to execute. Trying next plan."; + } catch (phi::enforce::EnforceNotMet& e) { + VLOG(4) << "Plan " << plan.describe() + << "failed to execute. 
Trying next plan."; + } + } + + PADDLE_THROW(phi::errors::InvalidArgument( + "[CUDNN Frontend API] No valid plan could " + "be found to execute conv backward filter.")); +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 12afa223f1655..e7d912a482599 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -23,31 +23,6 @@ namespace phi { using ConvArgs = ConvArgsBase; -static inline double ToMegaBytes(size_t bytes) { - return static_cast(bytes) / (1 << 20); -} - -static inline bool UseFixedWorkspace() { - return FLAGS_conv_workspace_size_limit >= 0; -} - -static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { - if (!use_fixed_workspace) { - int device_id = phi::backends::gpu::GetCurrentDeviceId(); - int64_t allocated = - paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id); - int64_t reserved = - paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id); - int64_t availble = paddle::platform::GpuAvailableMemToAlloc(); - VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) - << " MB, reserved=" << ToMegaBytes(reserved) - << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; - return std::max(availble, reserved - allocated); - } else { - return FLAGS_conv_workspace_size_limit * 1024 * 1024; - } -} - template std::string GetPerfResultString(std::string prefix, const std::vector& perf_results, diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 8f3604ed42ec3..4353c6789de37 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -36,6 +36,31 @@ using ScalingParamType = enum class ConvKind { kForward = 1, kBackwardData = 2, kBackwardFilter = 3 }; +static inline double ToMegaBytes(size_t bytes) { + return static_cast(bytes) / (1 << 20); +} + +static inline bool UseFixedWorkspace() { + return FLAGS_conv_workspace_size_limit >= 0; +} + +static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { + if (!use_fixed_workspace) { + int device_id = phi::backends::gpu::GetCurrentDeviceId(); + int64_t allocated = + paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id); + int64_t availble = paddle::platform::GpuAvailableMemToAlloc(); + VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) + << " MB, reserved=" << ToMegaBytes(reserved) + << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB."; + return std::max(availble, reserved - allocated); + } else { + return FLAGS_conv_workspace_size_limit * 1024 * 1024; + } +} + // The container of SearchAlgorithm::Find() result. 
template struct SearchResult { diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 276480ed54dbd..0d5f266d3d172 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -32,8 +32,374 @@ #include "paddle/phi/kernels/funcs/padding.h" #include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#ifdef PADDLE_WITH_CUDNN_FRONTEND +// clang-format off +#include "paddle/phi/backends/dynload/cudnn_frontend.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h" +// clang-format on +#endif + namespace phi { +template +void ConvCudnnGradKernelImplV7( + const DenseTensor* transformed_input, + const DenseTensor* transformed_filter_channel, + const DenseTensor* transformed_output_grad_channel, + DenseTensor* input_grad, + DenseTensor* filter_grad, + const Context& ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + paddle::platform::DataLayout compute_format, + paddle::platform::DataLayout layout, + bool use_addto, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_input_grad, + DenseTensor* transformed_filter_grad_channel) { + const T* input_data = transformed_input->data(); + const T* output_grad_data = transformed_output_grad_channel->data(); + const T* filter_data = transformed_filter_channel->data(); + T* filter_grad_data = nullptr; + T* input_grad_data = nullptr; + T* transformed_input_grad_data = nullptr; + + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + auto dtype = paddle::platform::CudnnDataType::type; + auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); + + ConvArgs args1{handle, + transformed_input_grad, + transformed_filter_channel, + transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype, + groups, + layout}; + ConvArgs args2{handle, + transformed_input, + transformed_filter_grad_channel, + transformed_output_grad_channel, + strides, + padding_common, + dilations, + dtype, + groups, + layout}; + + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + if (compute_format == paddle::platform::DataLayout::kNHWC) { + GetNCDHW(transformed_input->dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output_grad_channel->dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + GetNCDHW(transformed_input->dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output_grad_channel->dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel->numel() / groups; + +// ------------------- cudnn backward algorithm --------------------- +#ifdef PADDLE_WITH_HIP + SearchResult bwd_result; + SearchResult filter_result; +#else + SearchResult bwd_result; + SearchResult filter_result; +#endif + size_t workspace_size = 0; + int iwo_groups = groups; + int c_groups = 1; + +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (input_grad) { + // ------------------- cudnn descriptors --------------------- + input_grad_data = input_grad->data(); + transformed_input_grad_data = 
transformed_input_grad->data(); + + args1.idesc.set(*transformed_input_grad, layout_tensor); + args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); + args1.odesc.set(*transformed_output_grad_channel, layout_tensor); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + bwd_result.algo = search1::Find( + args1, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search1 = SearchAlgorithm; + bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); + workspace_size = std::max(workspace_size, bwd_result.workspace_size); +#endif + } + + if (filter_grad) { + // ------------------- cudnn descriptors --------------------- + filter_grad_data = transformed_filter_grad_channel->data(); + + args2.idesc.set(*transformed_input, layout_tensor); + args2.wdesc.set( + *transformed_filter_grad_channel, layout_tensor, iwo_groups); + args2.odesc.set(*transformed_output_grad_channel, layout_tensor); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_result.algo = search2::Find( + args2, exhaustive_search, deterministic, workspace_size, ctx); +#else + using search2 = SearchAlgorithm; + filter_result = + search2::Find(ctx, args2, exhaustive_search, deterministic); + VLOG(3) << "filter algo: " << filter_result.algo << ", time " + << filter_result.time; + workspace_size = std::max(workspace_size, filter_result.workspace_size); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + +#endif + VLOG(4) << "Conv_grad: use_addto = " << use_addto; + + if (input_grad) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. 
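+// In other words, cuDNN applies its usual alpha/beta scaling to this call:
+//     dx = alpha * conv_bwd_data(dy, w) + beta * dx
+// so beta = 0 simply overwrites dx, while beta = 1 (the use_addto case)
+// accumulates the new gradient into whatever dx already holds. Since MIOpen
+// only supports beta = 0, the HIP branch below handles use_addto by writing
+// into a temporary tensor and then adding it to dx via miopenOpTensor.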
+#ifdef PADDLE_WITH_HIP + if (use_addto) { + DenseTensor temp_tensor(transformed_input_grad->type()); + temp_tensor.Resize(transformed_input_grad->dims()); + T* temp_tensor_data = ctx.template Alloc(&temp_tensor); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + bwd_result.algo, + &beta, + args1.idesc.desc(), + temp_tensor_data, + cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args1.idesc.desc(), + transformed_input_grad_data, + &alpha, + args1.idesc.desc(), + temp_tensor_data, + &beta, + args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + output_grad_data, + args1.wdesc.desc(), + filter_data, + args1.cdesc.desc(), + bwd_result.algo, + &beta, + args1.idesc.desc(), + transformed_input_grad_data, + cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else + ConvRunner::Apply(ctx, + args1, + bwd_result, + output_grad_data, + filter_data, + transformed_input_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + use_addto); +#endif + } + + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { +// Because beta is zero, it is unnecessary to reset filter_grad. +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + output_grad_data, + args2.idesc.desc(), + input_data, + args2.cdesc.desc(), + filter_result.algo, + &beta, + args2.wdesc.desc(), + filter_grad_data, + cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + ConvRunner::Apply(ctx, + args2, + filter_result, + output_grad_data, + input_data, + filter_grad_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); +#endif + } +} + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +template +void ConvCudnnGradKernelImplV8( + const DenseTensor* transformed_input, + const DenseTensor* transformed_filter_channel, + const DenseTensor* transformed_output_grad_channel, + DenseTensor* input_grad, + DenseTensor* filter_grad, + const Context& ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + paddle::platform::DataLayout layout, + bool use_addto, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_input_grad, + DenseTensor* transformed_filter_grad_channel) { + PADDLE_ENFORCE_EQ( + groups, + 1, + paddle::platform::errors::Unimplemented( + "Group concolution using CUDNNv8 API is unsupported for now")); + + cudnnHandle_t handle = const_cast(ctx.cudnn_handle()); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + auto dtype = paddle::platform::CudnnDataType::type; + auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); + + if (input_grad) { + CudnnConvBwdDataV8(transformed_output_grad_channel, + transformed_filter_channel, + 
handle, + &workspace_handle, + strides, + padding_common, + dilations, + dtype, + layout_format, + use_addto, + exhaustive_search, + deterministic, + transformed_input_grad); + } + + if (filter_grad) { + CudnnConvBwdFilterV8(transformed_input, + transformed_output_grad_channel, + handle, + &workspace_handle, + strides, + padding_common, + dilations, + dtype, + layout_format, + use_addto, + exhaustive_search, + deterministic, + transformed_filter_grad_channel); + } +} +#endif + template void ConvCudnnGradKernel(const Context& ctx, const DenseTensor& input, @@ -252,249 +618,74 @@ void ConvCudnnGradKernel(const Context& ctx, } } } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - auto handle = ctx.cudnn_handle(); paddle::platform::DataLayout layout = compute_format == paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNHWC : paddle::platform::DataLayout::kNCHW; - - ConvArgs args1{handle, - &transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype, - groups, - layout}; - ConvArgs args2{handle, - &transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype, - groups, - layout}; - // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout - if (transformed_input.dims().size() == 5) { layout = compute_format == paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNDHWC : paddle::platform::DataLayout::kNCDHW; } - auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); - auto workspace_handle = ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == paddle::platform::DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNHWC, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), - paddle::platform::DataLayout::kNHWC, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - } else { - GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), - paddle::platform::DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - SearchResult bwd_result; - SearchResult filter_result; -#else - SearchResult bwd_result; - SearchResult filter_result; -#endif - size_t workspace_size = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); 
- args1.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - bwd_result.algo = search1::Find( - args1, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); - workspace_size = std::max(workspace_size, bwd_result.workspace_size); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_result.algo = search2::Find( - args2, exhaustive_search, deterministic, workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - filter_result = - search2::Find(ctx, args2, exhaustive_search, deterministic); - VLOG(3) << "filter algo: " << filter_result.algo << ", time " - << filter_result.time; - workspace_size = std::max(workspace_size, filter_result.workspace_size); -#endif - } - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; +#ifdef PADDLE_WITH_CUDNN_FRONTEND + if (dynload::IsCudnnFrontendEnabled() && (groups == 1)) + ConvCudnnGradKernelImplV8(&transformed_input, + &transformed_filter_channel, + &transformed_output_grad_channel, + input_grad, + filter_grad, + ctx, + strides, + padding_common, + dilations, + layout, + use_addto, + exhaustive_search, + deterministic, + groups, + &transformed_input_grad, + &transformed_filter_grad_channel); + else + ConvCudnnGradKernelImplV7(&transformed_input, + &transformed_filter_channel, + &transformed_output_grad_channel, + input_grad, + filter_grad, + ctx, + strides, + padding_common, + dilations, + compute_format, + layout, + use_addto, + exhaustive_search, + deterministic, + groups, + &transformed_input_grad, + &transformed_filter_grad_channel); #else - ScalingParamType beta = use_addto ? 1.0f : 0.0f; - + ConvCudnnGradKernelImplV7(&transformed_input, + &transformed_filter_channel, + &transformed_output_grad_channel, + input_grad, + filter_grad, + ctx, + strides, + padding_common, + dilations, + compute_format, + layout, + use_addto, + exhaustive_search, + deterministic, + groups, + &transformed_input_grad, + &transformed_filter_grad_channel); #endif - VLOG(4) << "Conv_grad: use_addto = " << use_addto; if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. -// When beta is 1, the output cannot be reset since addt strategy used. 
-#ifdef PADDLE_WITH_HIP - if (use_addto) { - DenseTensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = ctx.template Alloc(&temp_tensor); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args1.odesc.desc(), - output_grad_data, - args1.wdesc.desc(), - filter_data, - args1.cdesc.desc(), - bwd_result.algo, - &beta, - args1.idesc.desc(), - temp_tensor_data, - cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor( - handle, - miopenTensorOpAdd, - &alpha, - args1.idesc.desc(), - transformed_input_grad_data, - &alpha, - args1.idesc.desc(), - temp_tensor_data, - &beta, - args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args1.odesc.desc(), - output_grad_data, - args1.wdesc.desc(), - filter_data, - args1.cdesc.desc(), - bwd_result.algo, - &beta, - args1.idesc.desc(), - transformed_input_grad_data, - cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); - } -#else - ConvRunner::Apply(ctx, - args1, - bwd_result, - output_grad_data, - filter_data, - transformed_input_grad_data, - groups, - group_offset_in, - group_offset_filter, - group_offset_out, - workspace_size, - &workspace_handle, - use_addto); -#endif - if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -526,45 +717,7 @@ void ConvCudnnGradKernel(const Context& ctx, } } - // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. 
-#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionBackwardWeights( - handle, - &alpha, - args2.odesc.desc(), - output_grad_data, - args2.idesc.desc(), - input_data, - args2.cdesc.desc(), - filter_result.algo, - &beta, - args2.wdesc.desc(), - filter_grad_data, - cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - ConvRunner::Apply(ctx, - args2, - filter_result, - output_grad_data, - input_data, - filter_grad_data, - groups, - group_offset_in, - group_offset_filter, - group_offset_out, - workspace_size, - &workspace_handle, - false); -#endif - if (compute_format == paddle::platform::DataLayout::kNHWC) { TransToChannelFirst( ctx, &transformed_filter_grad_channel, filter_grad); diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index ba4cc12990706..3e3b1fb198da9 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -33,8 +33,309 @@ #include "paddle/phi/kernels/funcs/padding.h" #include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#ifdef PADDLE_WITH_CUDNN_FRONTEND +// clang-format off +#include "paddle/phi/backends/dynload/cudnn_frontend.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h" +// clang-format on +#endif + namespace phi { +template +void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, + const DenseTensor* transformed_filter_channel, + const Context& ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + paddle::platform::DataLayout compute_format, + paddle::platform::DataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_output) { + const T* input_data = transformed_input->data(); + const T* filter_data = transformed_filter_channel->data(); + T* output_data = transformed_output->data(); + + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); + auto dtype = paddle::platform::CudnnDataType::type; + + // ------------------- cudnn descriptors --------------------- + ConvArgs args{handle, + transformed_input, + transformed_filter_channel, + transformed_output, + strides, + padding_common, + dilations, + dtype, + groups, + compute_format}; + +#ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn(), + groups); +#else + args.cdesc.set(dtype, + padding_common, + strides, + dilations, + paddle::platform::AllowTF32Cudnn()); +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); + groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; +#endif + args.idesc.set(*transformed_input, layout_format); + args.wdesc.set(*transformed_filter_channel, layout_format, groups); + args.odesc.set(*transformed_output, layout_format); + int i_n, i_c, i_d, i_h, i_w; + int o_n, o_c, o_d, o_h, o_w; + + if (compute_format == paddle::platform::DataLayout::kNHWC) { + GetNCDHW(transformed_input->dims(), + paddle::platform::DataLayout::kNHWC, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output->dims(), + paddle::platform::DataLayout::kNHWC, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } else { + GetNCDHW(transformed_input->dims(), + paddle::platform::DataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + GetNCDHW(transformed_output->dims(), + paddle::platform::DataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + } + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = transformed_filter_channel->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size = 0; // final workspace to allocate. +// ------------------- cudnn conv algorithm --------------------- +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result; + using search = SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + fwd_result.algo = search::Find( + args, exhaustive_search, deterministic, workspace_size, ctx); +#else + SearchResult fwd_result; + using search = SearchAlgorithm; + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); + workspace_size = fwd_result.workspace_size; +#endif + +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) + // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ + // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable + // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ + // FWD_ALGO_IMPLICIT_GEMM manually. + if (groups > 1) { + fwd_result.algo = static_cast(0); + } +#endif + + // ------------------- cudnn conv forward --------------------- + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; + // VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); + +#ifdef PADDLE_WITH_HIP + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenConvolutionForward( + handle, + &alpha, + args.idesc.desc(), + input_data, + args.wdesc.desc(), + filter_data, + args.cdesc.desc(), + fwd_result.algo, + &beta, + args.odesc.desc(), + output_data, + workspace_ptr, + workspace_size)); + }, + workspace_size); +#else + ConvRunner::Apply(ctx, + args, + fwd_result, + input_data, + filter_data, + output_data, + groups, + group_offset_in, + group_offset_filter, + group_offset_out, + workspace_size, + &workspace_handle, + false); +#endif +} + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +template +void ConvCudnnKernelImplV8(const DenseTensor* input_tensor, + const DenseTensor* filter_channel_tensor, + const Context& ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations, + paddle::platform::DataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* output_tensor) { + auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kConvForwardV8); + + PADDLE_ENFORCE_EQ( + groups, + 1, + paddle::platform::errors::Unimplemented( + "Group concolution using CUDNNv8 API unsupported for now")); + + T* input_data = const_cast(input_tensor->data()); + T* filter_data = const_cast(filter_channel_tensor->data()); + T* output_data = output_tensor->data(); + cudnnHandle_t handle = const_cast(ctx.cudnn_handle()); + auto workspace_handle = ctx.cudnn_workspace_handle(); + + auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); + auto dtype = paddle::platform::CudnnDataType::type; + + float alpha = 1.0f; + float beta = 0.0f; + + using helper = CudnnFrontendConvHelper; + auto op_graph = helper::BuildConvOperationGraph< + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR>( + input_tensor, + output_tensor, + filter_channel_tensor, + layout_format, + strides, + padding_common, + dilations, + dtype, + handle, + alpha, + beta); + + if (plan_cache.FindPlan(op_graph)) { + auto engine_config = plan_cache.GetConfig(op_graph, handle); + auto cached_plan = cudnn_frontend::ExecutionPlanBuilder() + .setHandle(handle) + .setEngineConfig(engine_config, op_graph.getTag()) + .build(); + auto workspace_size = cached_plan.getWorkspaceSize(); + VLOG(4) << "Cached execution plan found." 
<< cached_plan.getTag() + << "; Require workspace: " << workspace_size; + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {input_data, output_data, filter_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + return; + } + + auto plans = helper::FindExecutionPlans(&op_graph, + exhaustive_search, + deterministic, + input_data, + output_data, + filter_data, + handle, + &workspace_handle); + + for (auto& plan : plans) { + try { + int64_t workspace_size = plan.getWorkspaceSize(); + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + void* data_ptrs[] = {input_data, output_data, filter_data}; + int64_t uids[] = {'x', 'y', 'w'}; + auto variant_pack = cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_ptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute( + handle, plan.get_raw_desc(), variant_pack.get_raw_desc())); + }, + workspace_size); + if (!exhaustive_search || plan_cache.IsStable(op_graph, plan.getTag())) { + plan_cache.InsertPlan(op_graph, plan); + } + return; + } catch (cudnn_frontend::cudnnException& e) { + VLOG(4) << "Plan " << plan.describe() + << "failed to execute. Trying next plan."; + } catch (phi::enforce::EnforceNotMet& e) { + VLOG(4) << "Plan " << plan.describe() + << "failed to execute. Trying next plan."; + } + } + PADDLE_THROW( + phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could " + "be found to execute conv.")); +} +#endif + template void ConvCudnnKernel(const Context& ctx, const DenseTensor& input, @@ -91,7 +392,7 @@ void ConvCudnnKernel(const Context& ctx, DenseTensor transformed_input_channel(input.type()); DenseTensor transformed_output(output->type()); DenseTensor transformed_filter_channel(filter.type()); - T* output_data = nullptr; + if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { VLOG(3) << "Transform input tensor from NHWC to NCHW."; ResizeToChannelFirst(ctx, &input, &transformed_input_channel); @@ -110,7 +411,6 @@ void ConvCudnnKernel(const Context& ctx, } else { transformed_filter_channel.ShareDataWith(filter); } - output_data = transformed_output.data(); // update padding and dilation auto in_dims = transformed_input_channel.dims(); @@ -205,24 +505,6 @@ void ConvCudnnKernel(const Context& ctx, } } - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{handle, - &transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype, - groups, - compute_format}; - paddle::platform::DataLayout layout = compute_format == paddle::platform::DataLayout::kNHWC ? paddle::platform::DataLayout::kNHWC @@ -232,146 +514,46 @@ void ConvCudnnKernel(const Context& ctx, ? 
paddle::platform::DataLayout::kNDHWC : paddle::platform::DataLayout::kNCDHW; } - auto layout_format = paddle::platform::GetCudnnTensorFormat(layout); -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn(), - groups); +#ifdef PADDLE_WITH_CUDNN_FRONTEND + if (dynload::IsCudnnFrontendEnabled() && (groups == 1)) + ConvCudnnKernelImplV8(&transformed_input, + &transformed_filter_channel, + ctx, + strides, + padding_common, + dilations, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_output); + else + ConvCudnnKernelImplV7(&transformed_input, + &transformed_filter_channel, + ctx, + strides, + padding_common, + dilations, + compute_format, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_output); #else - args.cdesc.set(dtype, - padding_common, - strides, - dilations, - paddle::platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == paddle::platform::DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNHWC, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - GetNCDHW(transformed_output.dims(), - paddle::platform::DataLayout::kNHWC, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - } else { - GetNCDHW(transformed_input.dims(), - paddle::platform::DataLayout::kNCHW, - &i_n, - &i_c, - &i_d, - &i_h, - &i_w); - GetNCDHW(transformed_output.dims(), - paddle::platform::DataLayout::kNCHW, - &o_n, - &o_c, - &o_d, - &o_h, - &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. -// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - SearchResult fwd_result; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - fwd_result.algo = search::Find( - args, exhaustive_search, deterministic, workspace_size, ctx); -#else - SearchResult fwd_result; - using search = SearchAlgorithm; - fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); - workspace_size = fwd_result.workspace_size; -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. 
- if (groups > 1) { - fwd_result.algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; - // VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenConvolutionForward( - handle, - &alpha, - args.idesc.desc(), - input_data, - args.wdesc.desc(), - filter_data, - args.cdesc.desc(), - fwd_result.algo, - &beta, - args.odesc.desc(), - output_data, - workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - ConvRunner::Apply(ctx, - args, - fwd_result, - input_data, - filter_data, - output_data, - groups, - group_offset_in, - group_offset_filter, - group_offset_out, - workspace_size, - &workspace_handle, - false); + ConvCudnnKernelImplV7(&transformed_input, + &transformed_filter_channel, + ctx, + strides, + padding_common, + dilations, + compute_format, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_output); #endif if (channel_last && compute_format == paddle::platform::DataLayout::kNCHW) { From 3b18d96b576862d41c023f97461acc4f15614bb0 Mon Sep 17 00:00:00 2001 From: james Date: Fri, 18 Nov 2022 14:22:12 +0800 Subject: [PATCH 085/210] fix device id issue for xpu eager mode (#48076) * fix device id issue for xpu eager xpu device id is not correctly set in eager mode, thus vars are on dev0 unless XPUDeviceGurad is called, leading to this error message for all node rank != 0: "NotImplementedError: (Unimplemented) Place Place(xpu:0) is not supported." 
* fix typo * fix pybind error --- paddle/fluid/distributed/collective/ProcessGroupBKCL.cc | 1 + .../eager/auto_code_generator/generator/python_c_gen.py | 9 +++++++++ paddle/fluid/pybind/distributed_py.cc | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index a5c80cb04108d..8dfb65d981374 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -105,6 +105,7 @@ void ProcessGroupBKCL::BroadcastUniqueBKCLID(BKCLUniqueId* bkcl_id) { void ProcessGroupBKCL::CreateBKCLEnvCache(const Place& place, const std::string& place_key) { + platform::XPUDeviceGuard guard(place.GetDeviceId()); BKCLUniqueId bkcl_id; if (rank_ == 0) { PADDLE_ENFORCE_XPU_SUCCESS(bkcl_get_unique_id(&bkcl_id)); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index 8e3944b79c30f..aacde58fa7bc2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -128,6 +128,15 @@ def FindParsingFunctionFromAttributeType(atype): #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); +#endif + }} + if (paddle::platform::is_xpu_place(place)) {{ +#if defined(PADDLE_WITH_XPU) + phi::backends::xpu::SetXPUDeviceId(place.device); + VLOG(4) <<"CurrentDeviceId: " << phi::backends::xpu::GetXPUCurrentDeviceId() << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with XPU if use XPUPlace.")); #endif }} """ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index dbc4c57c656ba..52160ea99a083 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -1284,7 +1284,7 @@ void BindDistributed(py::module *m) { auto processGroupBKCL = py::class_>( - *m, "ProcessGroupBKCL", ProcessGroup) + *m, "ProcessGroupBKCL", ProcessGroupStream) .def(py::init &, int, int, From 1fb4d90b710090f38ebcda82d6dae4559642737d Mon Sep 17 00:00:00 2001 From: Dandelight <55911877+Dandelight@users.noreply.github.com> Date: Fri, 18 Nov 2022 14:25:31 +0800 Subject: [PATCH 086/210] Add description to `nn.functional.celu` (#48074) --- python/paddle/nn/functional/activation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f8eb9d35d58ac..af5fa1336f1f0 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -35,17 +35,19 @@ def celu(x, alpha=1.0, name=None): r""" celu activation. + Apply the following operation to each element of the input Tensor accroding to the `Continuously Differentiable Exponential Linear Units `_. + .. math:: - celu(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1)) + \operatorname{celu}(x) = \max(0, x) + \min(0, \alpha * (\mathrm{e}^{x/\alpha}-1)) Parameters: - x (Tensor): The input Tensor with data type float32, float64. - alpha (float, optional): The 'alpha' value of the CELU formulation. Default is 1.0. + x (Tensor): The input Tensor with data type float16, float32, or float64. + alpha (float, optional): The 'alpha' value of the CELU formula. Default is 1.0. 
name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A Tensor with the same data type and shape as ``x`` . + A ``Tensor`` with the same data type and shape as ``x`` . Examples: .. code-block:: python From aafa9820ea46343de4881f65030f5fa452be3c6c Mon Sep 17 00:00:00 2001 From: james Date: Fri, 18 Nov 2022 14:40:28 +0800 Subject: [PATCH 087/210] correct sync behavior for XPU distributed training (#47882) * correct sync behavior for XPU distributed training XPU support event mechanism similar to cuda event, so it is advisable to use an event to sync compute/comm streams for performance. However this mechanism is never fully tested, and inconsistent loss/ending_epochs are reported. Therefore, this PR replaces event sync with stream waiting as a temporary solution. * remove compile warning --- .../fluid/distributed/collective/BKCLTools.h | 18 +++-------------- .../collective/ProcessGroupBKCL.cc | 6 ++++++ paddle/phi/backends/xpu/xpu_context.cc | 2 +- ...d_split.cc => concat_and_split_functor.cc} | 20 ++----------------- 4 files changed, 12 insertions(+), 34 deletions(-) rename paddle/phi/kernels/xpu/{concat_and_split.cc => concat_and_split_functor.cc} (87%) diff --git a/paddle/fluid/distributed/collective/BKCLTools.h b/paddle/fluid/distributed/collective/BKCLTools.h index e08bb61438c88..0572b852f6e90 100644 --- a/paddle/fluid/distributed/collective/BKCLTools.h +++ b/paddle/fluid/distributed/collective/BKCLTools.h @@ -77,23 +77,11 @@ class XPUEventManager { device_index_)); platform::XPUDeviceGuard guard(device_index_); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream())); + // TODO(zhangxiaoci) temporary solution: xpu::event seems buggy + PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream())); } - void Block(const XPUContext& ctx) const { - if (is_created_) { - auto device_index = ctx.GetPlace().device; - PADDLE_ENFORCE_EQ(device_index, - device_index_, - platform::errors::PreconditionNotMet( - "XPUContext's device %d does not match" - "Event's device %d", - device_index, - device_index_)); - platform::XPUDeviceGuard guard(device_index_); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_)); - } - } + void Block(const XPUContext& ctx) const {} private: bool is_created_{false}; diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 8dfb65d981374..5c122ce2a3216 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) { if (barrier_) { // If we use the work to do barrier, we should block cpu + + // TODO(zhangxiaoci) There is no such function that can sync entire device + // for xpu (for now), so all we can do is sync whatever stream that we know + // and hope for the best. Note that for correctness the communication stream + // needs to be in sync mode. 
platform::XPUDeviceGuard guard(place_.GetDeviceId()); xpu_wait(); + calc_ctx->Wait(); } return true; } diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 7257f3f20b06b..4065306abc798 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -64,7 +64,7 @@ struct XPUContext::Impl { // manually destroy XPUStream here until xpu::api integrates this work // into Context dtor xpu_wait(context_->xpu_stream); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream)); + xpu_stream_destroy(context_->xpu_stream); context_->xpu_stream = nullptr; xpu::destroy_context(context_); context_ = nullptr; diff --git a/paddle/phi/kernels/xpu/concat_and_split.cc b/paddle/phi/kernels/xpu/concat_and_split_functor.cc similarity index 87% rename from paddle/phi/kernels/xpu/concat_and_split.cc rename to paddle/phi/kernels/xpu/concat_and_split_functor.cc index 225f9555b02e6..769458523a68c 100644 --- a/paddle/phi/kernels/xpu/concat_and_split.cc +++ b/paddle/phi/kernels/xpu/concat_and_split_functor.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" - -#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" namespace phi { @@ -67,14 +65,7 @@ class ConcatFunctor { reinterpret_cast(output->data()), xdims_list, axis); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - paddle::platform::errors::External( - "XPU API return wrong value[%d %s], please check whether " - "Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat"); } }; @@ -126,14 +117,7 @@ class SplitFunctor { xdims_list, split_list, axis); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - paddle::platform::errors::External( - "XPU API return wrong value[%d %s], please check whether " - "Baidu Kunlun Card is properly installed.", - r, - XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "split"); } }; From d7f7963f568dd31fa04929fc5eb0bb7d1ada7d51 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Fri, 18 Nov 2022 14:57:43 +0800 Subject: [PATCH 088/210] [AutoParallel] selective recompute (#48111) * [AutoParallel] selective recompute * add cmakelist --- .../distributed/auto_parallel/constants.py | 1 + .../distributed/auto_parallel/dist_loader.py | 2 +- .../distributed/auto_parallel/engine.py | 31 +-- .../distributed/auto_parallel/interface.py | 10 +- .../paddle/distributed/auto_parallel/utils.py | 41 ++- .../passes/auto_parallel_recompute.py | 135 +++++++--- .../unittests/auto_parallel/CMakeLists.txt | 2 + .../auto_parallel/recompute_pass_unittest.py | 19 +- .../auto_parallel/test_selective_recompute.py | 175 +++++++++++++ .../unittests/auto_parallel_gpt_model.py | 247 +++++++----------- 10 files changed, 428 insertions(+), 235 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 51afad94c535b..857245b9be425 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -55,6 +55,7 @@ def set_field_default_config(category, field, default_value): RECOMPUTE = "recompute" set_field_default_config(RECOMPUTE, "enable", False) set_field_default_config(RECOMPUTE, 
"checkpoints", None) +set_field_default_config(RECOMPUTE, "no_recompute_segments", []) set_field_default_config(RECOMPUTE, "enable_tuning", False) ######################################### diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index f982f7458999e..f0e0b8aa5a0d7 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -134,7 +134,7 @@ def __next__(self): raise StopIteration def _infer_steps(self): - if isinstance(self.steps_per_epoch, int) and self.steps_per_epoch > 1: + if isinstance(self.steps_per_epoch, int) and self.steps_per_epoch > 0: return self.steps_per_epoch try: if isinstance(self.dataset, IterableDataset): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 116eaa97f1088..8e27b9aac6c70 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -610,7 +610,7 @@ def _build(self, mode): if mode != "train": serial_main_prog = serial_main_prog.clone(for_test=True) - self._set_recompute_ckpts() + auto_utils.set_recompute_ckpts(self._model, self._strategy) self._dist_contexts[mode] = DistributedContext( serial_main_prog, serial_startup_prog, @@ -1518,35 +1518,6 @@ def _is_local_var(self, var): var_name = _to_name_str(var) return var_name in self.main_program.global_block().vars - def _set_recompute_ckpts(self): - # NOTE hack to enable recompute in engine api for GPT-3 - # TODO support more PaddleNLP/CV models here - - recompute = self._strategy.recompute - - # extract ckpts by specific model - if isinstance(self._model, paddle.nn.Layer): - if hasattr( - self._model, "gpt" - ) and self._model.__class__.__name__ in [ - 'GPTForPretraining', - 'GPTForPretrainingAuto', - ]: - exact_ckpts = self._model.gpt.checkpoints - else: - exact_ckpts = recompute.checkpoints - else: - exact_ckpts = recompute.checkpoints - - # modify strategy - if recompute.enable: - recompute.checkpoints = exact_ckpts[:] - logs = { - 'Model Class': self._model.__class__.__name__, - 'Applied Recompute ckpts': exact_ckpts, - } - self._logger.info(logs) - def _reset_metrics(self): for metric in self._metrics: metric.reset() diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index cc8afb4f27173..b85d85011a1fa 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -195,7 +195,13 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None): return op +_g_recompute_idx = -1 + + def recompute(op): + global _g_recompute_idx + _g_recompute_idx += 1 + class RecomputeOperator: def __init__(self, op): self._op = op @@ -209,7 +215,9 @@ def __call__(self, *args, **kwargs): for idx in range(op_size, new_op_size): op = cur_block.ops[idx] - op._set_attr("is_recompute@auto_parallel", True) + op._set_attr( + 'op_namescope', "/auto_parallel/rc_" + str(_g_recompute_idx) + ) return output diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 35b3483a31481..be4c68d97d840 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -33,6 +33,9 @@ OperatorDistributedAttribute, ) +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +OpRole = 
core.op_proto_and_checker_maker.OpRole + __no_shape_var_type__ = [ core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES, @@ -1181,7 +1184,6 @@ def _get_split_indices( def set_grad_var_shape(program, dist_context): from .operators.common import infer_shape - from paddle.distributed.fleet.meta_optimizers.common import OpRole block = program.global_block() vars = block.vars @@ -1315,10 +1317,6 @@ def set_grad_var_shape(program, dist_context): grad_var.desc.set_shape(ref_shape) -OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() -OpRole = core.op_proto_and_checker_maker.OpRole - - def is_forward_op(op): op_role = int(op.attr('op_role')) return OP_ROLE_KEY in op.attr_names and ( @@ -1896,6 +1894,39 @@ def initialize_pg_in_full_mode(all_process_groups, cur_rank): server_socket.close() +def set_recompute_ckpts(model, strategy): + from .interface import _g_recompute_idx + + if _g_recompute_idx > -1: + return + + recompute = strategy.recompute + if not recompute.enable: + return + + # NOTE: hack to enable recompute in engine api for GPT-3 + # TODO support more PaddleNLP/CV models here + # extract ckpts by specific model + if isinstance(model, paddle.nn.Layer): + if hasattr(model, "gpt") and model.__class__.__name__ in [ + 'GPTForPretraining', + 'GPTForPretrainingAuto', + ]: + exact_ckpts = model.gpt.checkpoints + else: + exact_ckpts = recompute.checkpoints + else: + exact_ckpts = recompute.checkpoints + + # modify strategy + recompute.checkpoints = exact_ckpts[:] + logs = { + 'Model Class': model.__class__.__name__, + 'Applied Recompute ckpts': exact_ckpts, + } + logging.info(logs) + + def get_input_split_info(cur_rank, var, dist_context): # deduce how the input data is split among the cluster tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index b725ac004eb01..5bdbe9d2dd5d9 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -17,7 +17,6 @@ from .pass_base import PassBase, register_pass from paddle.fluid import core, unique_name from paddle.fluid import framework as framework -from paddle.fluid.framework import Variable from paddle.fluid.backward import _append_grad_suffix_, _get_no_grad_set_name from paddle.fluid.backward import ProgramStats, _rename_arg_, _find_op_path_ from paddle.distributed.auto_parallel.dist_attribute import ( @@ -33,12 +32,21 @@ ) +def _to_be_recomputed(op): + return op.has_attr('op_namescope') and "/auto_parallel/rc_" in op.attr( + 'op_namescope' + ) + + class RecomputeState(ProgramStats): def __init__(self, block, ops): super().__init__(block=block, ops=ops) self._block = block self._ops = ops + # {varname: {as_input_ops: op_idx, as_output_ops: op_idx}} self.var_op_deps = {} + # {segment_name: op_idx} + self.seg_op_deps = {} def build_stats(self): for i, op in enumerate(self._ops): @@ -58,36 +66,72 @@ def build_stats(self): self.var_op_deps[name]["var_as_input_ops"] = [] self.var_op_deps[name]["var_as_output_ops"] = [i] - def get_recompute_segments(self, checkpoints): - """get recompute segments from checkpoints""" + if not _to_be_recomputed(op): + continue + + seg_name = op.attr('op_namescope') + if seg_name not in self.seg_op_deps: + self.seg_op_deps[seg_name] = [i] + else: + assert ( + self.seg_op_deps[seg_name][-1] + 1 == i + ), "The recompute segment's ops should be continuous" + 
self.seg_op_deps[seg_name].extend([i]) + + def get_recompute_segments( + self, checkpoints_list=None, no_recompute_segments=[] + ): + """get recompute segments and checkpoints""" segments = [] - start_idx = -1 - pre_segment_end_idx = -1 - while start_idx + 1 < len(checkpoints): - if start_idx == -1: - ckpt_name = checkpoints[start_idx + 1] - if ckpt_name not in self.var_op_deps: - start_idx += 1 + checkpoints = checkpoints_list or [] + + if len(checkpoints) == 0: + # the segments is marked by `auto.recompute()` api + for segment_idx in self.seg_op_deps.values(): + if len(segment_idx) == 1: continue - op_idx_list = self.var_op_deps[ckpt_name]["var_as_output_ops"] - if op_idx_list: - segments.append([0, max(op_idx_list) + 1]) - else: - flag, min_idx, max_idx = self.is_subgraph( - [checkpoints[start_idx]], [checkpoints[start_idx + 1]] - ) - if flag: - min_idx = self._update_segment_start( - min_idx, pre_segment_end_idx - ) - segments.append([min_idx, max_idx + 1]) + segments.append([segment_idx[0], segment_idx[-1] + 1]) + checkpoints.extend(self._ops[segment_idx[-1]].output_arg_names) + else: + # the segments is marked by `strategy.checkpoints` api + start_idx = -1 + pre_segment_end_idx = -1 + while start_idx + 1 < len(checkpoints): + if start_idx == -1: + ckpt_name = checkpoints[start_idx + 1] + if ckpt_name not in self.var_op_deps: + start_idx += 1 + continue + op_idx_list = self.var_op_deps[ckpt_name][ + "var_as_output_ops" + ] + if op_idx_list: + segments.append([0, max(op_idx_list) + 1]) else: - logging.info( - "Could not recompute op range [{}] - [{}] ".format( - min_idx, max_idx + 1 - ) + flag, min_idx, max_idx = self.is_subgraph( + [checkpoints[start_idx]], [checkpoints[start_idx + 1]] ) - start_idx += 1 + if flag: + min_idx = self._update_segment_start( + min_idx, pre_segment_end_idx + ) + segments.append([min_idx, max_idx + 1]) + else: + logging.info( + "Could not recompute op range [{}] - [{}] ".format( + min_idx, max_idx + 1 + ) + ) + start_idx += 1 + + if no_recompute_segments: + for i in reversed(sorted(no_recompute_segments)): + assert i < len( + segments + ), "the no_recompute_segments idx [{}] should be lower the number of segment [{}]".format( + i, len(segments) + ) + segments.pop(i) for i, (idx1, idx2) in enumerate(segments): logging.info("recompute segment[{}]".format(i)) @@ -106,7 +150,10 @@ def get_recompute_segments(self, checkpoints): ) ) - return segments + return segments, checkpoints + + def is_recompute(self): + return any([_to_be_recomputed(op) for op in self._ops]) def modify_forward_desc_for_recompute(self, dist_context): """ @@ -162,6 +209,7 @@ def modify_forward_desc_for_recompute(self, dist_context): outputs={"Out": seed_var}, attrs={"seed": seed, "force_cpu": True}, ) + seed_op._set_attr('op_namescope', cur_op.attr('op_namescope')) # set new seed op's dist_attr naive_set_dist_op_attr_for_program_by_mesh_and_mapping( seed_op, ref_process_mesh, ref_dims_mapping, dist_context @@ -196,7 +244,6 @@ def _get_stop_gradients(program, no_grad_set): no_grad_set_name = set() for var in program.list_vars(): - assert isinstance(var, Variable) if "@GRAD" in var.name: break if var.stop_gradient: @@ -244,14 +291,13 @@ def __init__(self): self.set_attr("loss", None) self.set_attr("dist_context", None) self.set_attr("no_grad_set", None) + self.set_attr("no_recompute_segments", []) def _check_self(self): if self.get_attr("dist_context") is None: return False if self.get_attr("loss") is None: return False - if self.get_attr("checkpoints") is None: - return False return True def 
_check_conflict(self, other_pass): @@ -259,25 +305,32 @@ def _check_conflict(self, other_pass): def _apply_single_impl(self, main_program, startup_program, context): checkpoints = self.get_attr("checkpoints") + no_recompute_segments = self.get_attr("no_recompute_segments") loss = self.get_attr("loss") no_grad_set = self.get_attr("no_grad_set") self._dist_context = self.get_attr("dist_context") + # 0. get op_path which is related to loss main_block = main_program.global_block() no_grad_set_name = _get_stop_gradients(main_program, no_grad_set) - # get op_path which is related to loss op_path = _find_op_path_(main_block, [loss], [], no_grad_set_name) - # step 1: build recompute state + # 1. build recompute state rc_state = RecomputeState(main_block, op_path) + if not rc_state.is_recompute() and not checkpoints: + return + + # 2. get the segments to be recomputed rc_state.modify_forward_desc_for_recompute(self._dist_context) rc_state.build_stats() - checkpoints = rc_state.sort_checkpoints(checkpoints) - segments = rc_state.get_recompute_segments(checkpoints) - if segments == []: + checkpoints = rc_state.sort_checkpoints(checkpoints or []) + segments, checkpoints = rc_state.get_recompute_segments( + checkpoints, no_recompute_segments + ) + if segments == [] or checkpoints == []: return - # step 2: get vars_should_be_hold + # 3. get vars that should be hold in memory vars_should_be_hold = [] for segment in segments: vars_should_be_hold.extend( @@ -295,9 +348,9 @@ def _apply_single_impl(self, main_program, startup_program, context): vars_should_be_hold = list(set(vars_should_be_hold)) vars_in_memory = vars_should_be_hold + checkpoints - # step 3: get recomputed fwd ops desc - var_name_dict = {} - ckpt_ops_dict = {} + # 4. get the fwd ops desc to be recomputed. + var_name_dict = {} # varname --> varname.subprog_XXX + ckpt_ops_dict = {} # ckpt_op_id --> segment_descs buffer_block = main_block.program._create_block() for i, segment in enumerate(segments[::-1]): fwd_ops = op_path[segment[0] : segment[1]] @@ -362,7 +415,7 @@ def _apply_single_impl(self, main_program, startup_program, context): ckpt_op = op_path[segment[1] - 1] ckpt_ops_dict[ckpt_op.desc.original_id()] = [True, segment_descs] - # step 4: insert recomputed fwd ops + # 5. 
insert recomputed fwd ops into backward parse ops = main_block.ops loss_op = get_loss_op(main_block) loss_op_idx = _find_op_index(main_block, loss_op) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index b2935a0b175b3..201241cb31e63 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -72,6 +72,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_parallel_tuner_predict MODULES test_parallel_tuner_predict ENVS ${dist_ENVS}) set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120) + py_test_modules(test_selective_recompute MODULES test_selective_recompute) + set_tests_properties(test_selective_recompute PROPERTIES TIMEOUT 50) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py index 1aa83f1a8c978..d9c179dda09b4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py @@ -22,13 +22,14 @@ from get_gpt_model import FakeDataset, generate_model -def apply_pass(use_recompute=False): +def apply_pass(use_recompute=False, no_recompute_segments=[]): strategy = auto.Strategy() strategy.auto_mode = "semi" strategy.reinit = True if use_recompute: recompute = strategy.recompute recompute.enable = True + recompute.no_recompute_segments = no_recompute_segments return strategy @@ -53,10 +54,10 @@ def init(self, engine): place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) engine._executor = paddle.static.Executor(place) - def get_engine(self, use_recompute=False): + def get_engine(self, use_recompute=False, no_recompute_segments=[]): reset_prog() - strategy = apply_pass(use_recompute) + strategy = apply_pass(use_recompute, no_recompute_segments) clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) model, loss = generate_model("mp") @@ -88,6 +89,18 @@ def test_recompute_pass(self): rc_losses = np.array(history.history["loss"]) self.check_results(mp_losses, rc_losses) + # mp2 selective recompute training + rc1_engine = self.get_engine(True, [0]) + history = rc1_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc1_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc1_losses) + + def test_recompute_pass_error(self): + + with self.assertRaises(AssertionError): + rc_engine = self.get_engine(True, [2]) + history = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py new file mode 100644 index 0000000000000..97e175a39801a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_selective_recompute.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import FakeDataset + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import ( + GPTModel, + GPTForPretraining, + GPTPretrainingCriterion, +) + + +def generate_model(use_new_recompute, recompute_granularity): + modeling.init_global() + modeling._global_parallel_strategy = "serial" + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0], dim_names=["x"]) + + gpt = GPTModel( + vocab_size=1000, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=256, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + use_new_recompute=use_new_recompute, + recompute_granularity=recompute_granularity, + ) + model = GPTForPretraining( + gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02 + ) + criterion = GPTPretrainingCriterion() + return model, criterion + + +def apply_pass(use_recompute=False, no_recompute_segments=[]): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_recompute: + recompute = strategy.recompute + recompute.enable = True + recompute.no_recompute_segments = no_recompute_segments + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestRecomputePassWithRecomputeAPI(unittest.TestCase): + def setUp(self): + self.rtol = 1e-6 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 2 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2022) + np.random.seed(2022) + random.seed(2022) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine( + self, + use_recompute=False, + use_new_recompute=False, + recompute_granularity="full", + no_recompute_segments=[], + ): + reset_prog() + + strategy = apply_pass(use_recompute, no_recompute_segments) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model(use_new_recompute, recompute_granularity) + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses + ), + ) + + def recompute_vars(self, program): + return list(filter(lambda a: "subprog" in a.name, program.list_vars())) + + def test_recompute_pass(self): + # mp2 training + mp_engine 
= self.get_engine() + history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(history.history["loss"]) + + # mp2 recompute with old api + rc4_engine = self.get_engine(True, False) + history = rc4_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc4_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc4_losses) + + # mp2 recompute core_attn + rc1_engine = self.get_engine(True, True, "core_attn", [0]) + history = rc1_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc1_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc1_losses) + + # mp2 recompute full_attn + rc2_engine = self.get_engine(True, True, "full_attn") + history = rc2_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc2_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc2_losses) + + # mp2 recompute full + rc3_engine = self.get_engine(True, True, "full") + history = rc3_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc3_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc3_losses) + + rc0_vars = self.recompute_vars(mp_engine.main_program) + rc1_vars = self.recompute_vars(rc1_engine.main_program) + rc2_vars = self.recompute_vars(rc2_engine.main_program) + rc3_vars = self.recompute_vars(rc3_engine.main_program) + + assert rc0_vars == [] + assert len(rc1_vars) < len(rc2_vars) and len(rc2_vars) < len(rc3_vars) + + def test_recompute_pass_error(self): + + with self.assertRaises(AssertionError): + rc_engine = self.get_engine(True, True, "full", [2]) + history = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 6e96ee6dcf83c..829e7f7a5ddc5 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -57,6 +57,8 @@ def __init__( bias_attr=None, fuse=False, mesh_idx=None, + use_new_recompute=False, + recompute_granularity="full", ): super().__init__() self.embed_dim = embed_dim @@ -67,6 +69,9 @@ def __init__( self.need_weights = need_weights self.fuse = fuse self.mesh_idx = mesh_idx + self.use_new_recompute = use_new_recompute + self.recompute_granularity = recompute_granularity + self.head_dim = embed_dim // num_heads assert ( self.head_dim * num_heads == self.embed_dim @@ -225,6 +230,27 @@ def gen_cache(self, key, value=None, type=Cache): # incremental_state with initial value, mainly for usage like UniLM return self.Cache(key, value) + def core_attn(self, q, k, v, attn_mask): + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 + ) + if attn_mask is not None: + product = product + attn_mask + weights = F.softmax(product) + if self.dropout: + weights = F.dropout( + weights, + self.dropout, + training=self.training, + mode="upscale_in_train", + ) + out = tensor.matmul(weights, v) + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + return out, weights + def forward( self, query, key, value, attn_mask=None, use_cache=False, cache=None ): @@ -244,23 +270,12 @@ def forward( q, k, v, cache = self._prepare_qkv( query, key, value, use_cache, cache ) - product = layers.matmul( - x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5 - ) - if attn_mask is not None: - 
product = product + attn_mask - weights = F.softmax(product) - if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train", - ) - out = tensor.matmul(weights, v) - # combine heads - out = tensor.transpose(out, perm=[0, 2, 1, 3]) - out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + if self.use_new_recompute and self.recompute_granularity == "core_attn": + out, weights = auto.recompute(self.core_attn)(q, k, v, attn_mask) + else: + out, weights = self.core_attn(q, k, v, attn_mask) + # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": @@ -295,12 +310,22 @@ class TransformerDecoder(nn.Layer): TransformerDecoder is a stack of N decoder layers. """ - def __init__(self, decoder_layers, num_layers, norm=None, hidden_size=None): + def __init__( + self, + decoder_layers, + num_layers, + norm=None, + hidden_size=None, + use_new_recompute=False, + recompute_granularity="full", + ): super().__init__() self.num_layers = num_layers self.layers = decoder_layers self.norm = norm + self.use_new_recompute = use_new_recompute + self.recompute_granularity = recompute_granularity if norm == "LayerNorm": self.norm = nn.LayerNorm(hidden_size) elif norm is not None: @@ -348,149 +373,36 @@ def forward( DPMPPP_MESH_LIST[0], ["x"] + [None for i in range(len(output.shape) - 1)], ) + for i, mod in enumerate(self.layers): + if self.use_new_recompute and self.recompute_granularity == "full": + mod = auto.recompute(mod) + if cache is None: if use_cache: - if _global_parallel_strategy == "pp": - output, new_cache = auto.shard_op( - mod, PP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - PP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_pp": - output, new_cache = auto.shard_op( - mod, DPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPPP_MESH_LIST[mod.mesh_idx], - ["x"] - + [None for i in range(len(output.shape) - 1)], - ) - elif _global_parallel_strategy == "mp_pp": - output, new_cache = auto.shard_op( - mod, MPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - MPPP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_mp_pp": - output, new_cache = auto.shard_op( - mod, DPMPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPMPPP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - else: - output, new_cache = mod( - output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache, - ) - new_caches.append(new_cache) - else: - if _global_parallel_strategy == "pp": - output = auto.shard_op(mod, PP_MESH_LIST[mod.mesh_idx])( - output, memory, tgt_mask, use_cache, cache - ) - auto.shard_tensor( - output, - PP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_pp": - output = auto.shard_op( - mod, DPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPPP_MESH_LIST[mod.mesh_idx], - ["x"] - + [None for i in range(len(output.shape) - 1)], - ) - elif _global_parallel_strategy == "mp_pp": - output = auto.shard_op( - mod, MPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - 
output, - MPPP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_mp_pp": - output = auto.shard_op( - mod, DPMPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPMPPP_MESH_LIST[mod.mesh_idx], - ["x"] - + [None for i in range(len(output.shape) - 1)], - ) - else: - output = mod( - output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache, - ) - else: - if _global_parallel_strategy == "pp": - output, new_cache = auto.shard_op( - mod, PP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - PP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_pp": - output, new_cache = auto.shard_op( - mod, DPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPPP_MESH_LIST[mod.mesh_idx], - ["x"] + [None for i in range(len(output.shape) - 1)], - ) - elif _global_parallel_strategy == "mp_pp": - output, new_cache = auto.shard_op( - mod, MPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - MPPP_MESH_LIST[mod.mesh_idx], - [None for i in range(len(output.shape))], - ) - elif _global_parallel_strategy == "dp_mp_pp": - output, new_cache = auto.shard_op( - mod, DPMPPP_MESH_LIST[mod.mesh_idx] - )(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - DPMPPP_MESH_LIST[mod.mesh_idx], - ["x"] + [None for i in range(len(output.shape) - 1)], - ) - else: output, new_cache = mod( output, memory, tgt_mask=tgt_mask, use_cache=use_cache, - cache=cache[i], + cache=cache, ) + new_caches.append(new_cache) + else: + output = mod(output, memory, tgt_mask, use_cache, cache) + else: + output, new_cache = mod( + output, + memory, + tgt_mask=tgt_mask, + use_cache=use_cache, + cache=cache[i], + ) new_caches.append(new_cache) - self.checkpoints.append(output.name) + + if not self.use_new_recompute: + self.checkpoints.append(output.name) + if self.norm is not None: output = self.norm(output) return output if use_cache is False else (output, new_caches) @@ -528,6 +440,8 @@ def __init__( weight_attr=None, bias_attr=None, mesh_idx=None, + use_new_recompute=False, + recompute_granularity="full", ): self._config = locals() self._config.pop("self") @@ -537,8 +451,12 @@ def __init__( attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before + self.use_new_recompute = use_new_recompute + self.recompute_granularity = recompute_granularity + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + self.self_attn = MultiHeadAttention( d_model, nhead, @@ -546,6 +464,8 @@ def __init__( weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], mesh_idx=self.mesh_idx, + use_new_recompute=self.use_new_recompute, + recompute_granularity=self.recompute_granularity, ) self.linear1 = nn.Linear( d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2] @@ -563,12 +483,19 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt if self.normalize_before: tgt = self.norm1(tgt) + + if self.use_new_recompute and self.recompute_granularity == "full_attn": + self_attn = auto.recompute(self.self_attn) + else: + self_attn = self.self_attn + if use_cache is False: - tgt = 
self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) + tgt = self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) else: - tgt, incremental_cache = self.self_attn( + tgt, incremental_cache = self_attn( tgt, tgt, tgt, tgt_mask, use_cache, cache ) + tgt = residual + self.dropout1(tgt) if not self.normalize_before: tgt = self.norm1(tgt) @@ -716,12 +643,17 @@ def __init__( bos_token_id=0, eol_token_id=3, pp_degree=None, + use_new_recompute=False, + recompute_granularity="full", ): super().__init__() self.pad_token_id = pad_token_id self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size + self.use_new_recompute = use_new_recompute + self.recompute_granularity = recompute_granularity + self.layer_per_stage = None self.pipline_mode = pp_degree is not None and pp_degree > 1 if self.pipline_mode: @@ -734,6 +666,7 @@ def __init__( type_vocab_size, self.initializer_range, ) + decoder_layers = nn.LayerList() for i in range(num_hidden_layers): mesh_index = None @@ -756,14 +689,19 @@ def __init__( ), bias_attr=None, mesh_idx=mesh_index, + use_new_recompute=self.use_new_recompute, + recompute_granularity=self.recompute_granularity, ) ) + Decoder = TransformerDecoder self.decoder = Decoder( decoder_layers, num_hidden_layers, norm="LayerNorm", hidden_size=hidden_size, + use_new_recompute=self.use_new_recompute, + recompute_granularity=self.recompute_granularity, ) self.checkpoints = [] @@ -817,7 +755,8 @@ def forward( use_cache=use_cache, cache=cache, ) - self.checkpoints.extend(self.decoder.checkpoints) + if not self.use_new_recompute: + self.checkpoints.extend(self.decoder.checkpoints) return encoder_outputs From edda13cd88b269c932e1d8fafa5a6fabbbda72a2 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Fri, 18 Nov 2022 15:34:10 +0800 Subject: [PATCH 089/210] Refactor collective communication reduce, scatter, reduce_scatter C++ API (#48115) --- .../fluid/distributed/collective/NCCLTools.h | 3 +- .../distributed/collective/ProcessGroup.h | 60 ++-- .../collective/ProcessGroupGloo.cc | 62 ++-- .../distributed/collective/ProcessGroupGloo.h | 22 +- .../collective/ProcessGroupNCCL.cc | 269 ++++++++---------- .../distributed/collective/ProcessGroupNCCL.h | 40 ++- .../collective/ProcessGroupStream.cc | 132 ++++----- .../collective/ProcessGroupStream.h | 76 +++-- paddle/fluid/pybind/distributed_py.cc | 198 ++++++------- .../communication/stream/reduce_scatter.py | 8 +- .../communication/stream/scatter.py | 8 +- 11 files changed, 432 insertions(+), 446 deletions(-) diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 464ae0b6581de..37b1e0f114c3d 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -47,7 +47,7 @@ namespace paddle { namespace distributed { -#define NCCLCHECK(cmd) \ +#define NCCL_CHECK(cmd) \ do { \ ncclResult_t r = cmd; \ if (r != ncclSuccess) { \ @@ -60,6 +60,7 @@ namespace distributed { } while (0) ncclRedOp_t ToNCCLRedType(ReduceOp reduction); + std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 152bb1aa6f9d1..795a1a91b5235 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -150,6 +150,36 @@ class ProcessGroup { GetBackendName())); } + virtual 
std::shared_ptr Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support reduce with sync_op flag.", + GetBackendName())); + } + + virtual std::shared_ptr ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support reduce_scatter with sync_op flag.", + GetBackendName())); + } + + virtual std::shared_ptr Scatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support scatter with sync_op flag.", + GetBackendName())); + } + virtual std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -273,16 +303,6 @@ class ProcessGroup { "ProcessGroup%s does not support reduce", GetBackendName())); } - virtual std::shared_ptr Reduce( - std::vector& /* input tensors */, // NOLINT - std::vector& /* output tensors */, // NOLINT - const ReduceOptions&, - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support reduce with sync_op flag", - GetBackendName())); - } - virtual std::shared_ptr Scatter( std::vector&, // NOLINT std::vector&, // NOLINT @@ -291,26 +311,6 @@ class ProcessGroup { "ProcessGroup%s does not support scatter", GetBackendName())); } - virtual std::shared_ptr Scatter( - std::vector&, // NOLINT - std::vector&, // NOLINT - const ScatterOptions&, - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support scatter with sync_op flag", - GetBackendName())); - } - - virtual std::shared_ptr ReduceScatter( - std::vector&, // NOLINT - std::vector&, // NOLINT - const ReduceScatterOptions&, - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support reduce_scatter with sync_op flag", - GetBackendName())); - } - protected: const int rank_; const int size_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 2574eb11be200..f0a65b02fb69f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -234,8 +234,8 @@ std::shared_ptr ProcessGroupGloo::Broadcast( const phi::DenseTensor& in_tensor, const BroadcastOptions& opts, bool sync_op) { - std::vector in_wrapper = {in_tensor}; - std::vector out_wrapper = {*out_tensor}; + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; return Broadcast(in_wrapper, out_wrapper, opts, true); } @@ -396,8 +396,8 @@ std::shared_ptr ProcessGroupGloo::AllGather( int64_t offset, // for compatibility, no use now int64_t numel, // for compatibility, no use now bool sync_op) { - std::vector in_wrapper = {in_tensor}; - std::vector out_wrapper = {*out_tensor}; + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; return AllGather(in_wrapper, out_wrapper, true); } @@ -475,26 +475,34 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Reduce( - std::vector& inputs, - std::vector& outputs, - const ReduceOptions& opts) { - return Reduce(inputs, outputs, opts, true); -} - -std::shared_ptr ProcessGroupGloo::Reduce( - std::vector& inputs, - std::vector& outputs, + 
phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, const ReduceOptions& opts, - bool sync_op) { + bool sync_op // for compatibility, no use now +) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); - task = std::make_shared( - rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag); + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; + task = std::make_shared(rank_, + context, + in_wrapper, + out_wrapper, + opts.reduce_op, + opts.root_rank, + tag); task->Run(); return task; } +std::shared_ptr ProcessGroupGloo::Reduce( + std::vector& inputs, + std::vector& outputs, + const ReduceOptions& opts) { + return Reduce(&outputs[0], inputs[0], opts, true); +} + class ScatterGlooTask : public ProcessGroupGloo::GlooTask { public: ScatterGlooTask(int rank, @@ -538,26 +546,28 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions& opts) { - return Scatter(in_tensors, out_tensors, opts, true); -} - -std::shared_ptr ProcessGroupGloo::Scatter( - std::vector& in_tensors, - std::vector& out_tensors, + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, const ScatterOptions& opts, bool sync_op) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); + std::vector in_wrapper{in_tensor}; + std::vector out_wrapper{*out_tensor}; task = std::make_shared( - rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag); + rank_, context, in_wrapper, out_wrapper, opts.root_rank, size_, tag); task->Run(); return task; } +std::shared_ptr ProcessGroupGloo::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts) { + return Scatter(&out_tensors[0], in_tensors[0], opts, true); +} + std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { ::gloo::transport::tcp::attr attr; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 474fb0c027c62..fd691e024c4a5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -120,6 +120,16 @@ class ProcessGroupGloo : public ProcessGroup { const BroadcastOptions& opts, bool sync_op) override; + std::shared_ptr Reduce(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op) override; + + std::shared_ptr Scatter(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op) override; + // TODO(sunyilun): methods below will be removed later std::shared_ptr Broadcast( std::vector& inputs, @@ -155,23 +165,11 @@ class ProcessGroupGloo : public ProcessGroup { std::vector& out_tensors, bool sync_op) override; - std::shared_ptr Reduce( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceOptions& opts, - bool sync_op) override; - std::shared_ptr Reduce( std::vector& in_tensors, std::vector& out_tensors, const ReduceOptions& opts) override; - std::shared_ptr Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&, - bool sync_op) override; - std::shared_ptr Scatter( std::vector& in_tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 
4a70b81e31093..74ebf80205964 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -87,11 +87,11 @@ ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, : ProcessGroupStream(rank, size, gid), store_(store) {} void ProcessGroupNCCL::GroupStart() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + NCCL_CHECK(platform::dynload::ncclGroupStart()); } void ProcessGroupNCCL::GroupEnd() { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + NCCL_CHECK(platform::dynload::ncclGroupEnd()); } const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( @@ -144,13 +144,13 @@ std::shared_ptr ProcessGroupNCCL::AllGather( const phi::DenseTensor& input, ncclComm_t comm, gpuStream_t stream) { - return platform::dynload::ncclAllGather( + NCCL_CHECK(platform::dynload::ncclAllGather( input.data(), output->data(), input.numel(), platform::ToNCCLDataType(input.dtype()), comm, - stream); + stream)); }, CommType::ALLGATHER, sync_op, @@ -170,14 +170,14 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( const phi::DenseTensor& input, ncclComm_t comm, gpuStream_t stream) { - return platform::dynload::ncclAllReduce( + NCCL_CHECK(platform::dynload::ncclAllReduce( input.data(), output->data(), input.numel(), platform::ToNCCLDataType(input.type()), ToNCCLRedType(opts.reduce_op), comm, - stream); + stream)); }, CommType::ALLREDUCE, sync_op, @@ -231,7 +231,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( for (auto i = 0; i < size_; i++) { in_numel = in_size_each_rank[i] * in_row_size; input_partial = GetPartialTensor(input, in_offset, in_numel); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + NCCL_CHECK(platform::dynload::ncclSend( input_partial.data(), in_numel, platform::ToNCCLDataType(input.dtype()), @@ -242,7 +242,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( out_numel = out_size_each_rank[i] * out_row_size; output_partial = GetPartialTensor(*output, out_offset, out_numel); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + NCCL_CHECK(platform::dynload::ncclRecv( output_partial.data(), out_numel, platform::ToNCCLDataType(output->dtype()), @@ -294,20 +294,127 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( ncclComm_t comm, gpuStream_t stream) { int root = opts.source_rank + opts.source_root; - return platform::dynload::ncclBroadcast( + NCCL_CHECK(platform::dynload::ncclBroadcast( input.data(), output->data(), input.numel(), platform::ToNCCLDataType(input.type()), root, comm, - stream); + stream)); }, CommType::BROADCAST, sync_op, use_calc_stream); } +std::shared_ptr ProcessGroupNCCL::Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + out_tensor, + in_tensor, + [&](phi::DenseTensor* output, + const phi::DenseTensor& input, + ncclComm_t comm, + gpuStream_t stream) { + NCCL_CHECK(platform::dynload::ncclReduce( + input.data(), + output->data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); + }, + CommType::REDUCE, + sync_op, + use_calc_stream); +} + +std::shared_ptr ProcessGroupNCCL::ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + out_tensor, + in_tensor, + [&](phi::DenseTensor* output, + const phi::DenseTensor& input, + ncclComm_t comm, + 
gpuStream_t stream) { + NCCL_CHECK(platform::dynload::ncclReduceScatter( + input.data(), + output->data(), + output->numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER, + sync_op, + use_calc_stream); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + out_tensor, + in_tensor, + [&](phi::DenseTensor* output, + const phi::DenseTensor& input, + ncclComm_t comm, + gpuStream_t stream) { + int64_t numel = input.numel() / size_; + if (rank_ == opts.root_rank) { + int64_t offset = 0; + phi::DenseTensor partial_tensor; + GroupStart(); + for (auto i = 0; i < size_; i++) { + partial_tensor = GetPartialTensor(input, offset, numel); + NCCL_CHECK(platform::dynload::ncclSend( + partial_tensor.data(), + numel, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += numel; + } + NCCL_CHECK(platform::dynload::ncclRecv( + output->data(), + numel, + platform::ToNCCLDataType(output->dtype()), + opts.root_rank, + comm, + stream)); + GroupEnd(); + } else { + NCCL_CHECK(platform::dynload::ncclRecv( + output->data(), + numel, + platform::ToNCCLDataType(output->dtype()), + opts.root_rank, + comm, + stream)); + } + }, + CommType::SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Recv( phi::DenseTensor* tensor, int src_rank, @@ -328,13 +435,13 @@ std::shared_ptr ProcessGroupNCCL::Recv( int src, ncclComm_t comm, gpuStream_t stream) { - return platform::dynload::ncclRecv( + NCCL_CHECK(platform::dynload::ncclRecv( output->data(), output->numel(), platform::ToNCCLDataType(output->dtype()), src, comm, - stream); + stream)); }, CommType::RECV, sync_op, @@ -361,13 +468,13 @@ std::shared_ptr ProcessGroupNCCL::Send( int dst, ncclComm_t comm, gpuStream_t stream) { - return platform::dynload::ncclSend( + NCCL_CHECK(platform::dynload::ncclSend( input->data(), input->numel(), platform::ToNCCLDataType(input->dtype()), dst, comm, - stream); + stream)); }, CommType::SEND, sync_op, @@ -406,7 +513,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, ncclUniqueId nccl_id; if (rank_ == 0) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -418,7 +525,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, platform::DeviceContextPool::Instance().Get(place)); auto comm_ctx = std::make_unique(place); ncclComm_t nccl_comm; - NCCLCHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(platform::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); comm_ctx->set_nccl_comm(nccl_comm); @@ -611,7 +718,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( ncclUniqueId nccl_id; if (rank_ == 0) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -632,7 +739,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( dev_ctx[i].reset(new phi::GPUContext(places[i])); ncclComm_t nccl_comm; - NCCLCHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(platform::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); dev_ctx[i]->set_nccl_comm(nccl_comm); dev_ctx_raw[i] = dev_ctx[i].get(); @@ -1257,70 +1364,6 @@ std::shared_ptr ProcessGroupNCCL::Reduce( 
CommType::REDUCE); } -std::shared_ptr ProcessGroupNCCL::Reduce( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceOptions& opts, - bool sync_op, - bool use_calc_stream) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](const phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - input.data(), - output.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - ToNCCLRedType(opts.reduce_op), - opts.root_rank, - comm, - stream)); - }, - CommType::REDUCE, - sync_op, - use_calc_stream); -} - -std::shared_ptr ProcessGroupNCCL::ReduceScatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - if (FLAGS_use_stream_safe_cuda_allocator) { - platform::CUDADeviceGuard cuda_guard; - cuda_guard.SetDevice(output.place()); - memory::RecordStream(output.Holder(), stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( - input.data(), - output.data(), - output.numel(), - platform::ToNCCLDataType(input.dtype()), - ToNCCLRedType(opts.reduce_op), - comm, - stream)); - }, - CommType::REDUCE_SCATTER, - sync_op, - use_calc_stream); -} - std::shared_ptr ProcessGroupNCCL::Scatter( std::vector& in_tensors, std::vector& out_tensors, @@ -1374,67 +1417,5 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } -std::shared_ptr ProcessGroupNCCL::Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - PADDLE_ENFORCE_EQ( - output.numel(), - input.numel() / size_, - platform::errors::InvalidArgument( - "Input and output tensors should have the same shape.")); - size_t offset = 0; - if (rank_ == opts.root_rank) { - GroupStart(); - for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input.data(), offset, input.dtype()), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - offset += input.numel() / size_; - } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - opts.root_rank, - comm, - stream)); - GroupEnd(); - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - opts.root_rank, - comm, - stream)); - } - }, - CommType::SCATTER, - sync_op, - use_calc_stream); -} - } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 
a6528be80b4a5..c10c4370b4b23 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -127,6 +127,25 @@ class ProcessGroupNCCL final : public ProcessGroupStream { bool sync_op, bool use_calc_stream) override; + std::shared_ptr Reduce(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr Scatter(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -184,32 +203,11 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::vector& out_tensors, const ReduceOptions& opts) override; - std::shared_ptr Reduce( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceOptions& opts, - bool sync_op, - bool use_calc_stream) override; - - std::shared_ptr ReduceScatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceScatterOptions& opts, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr Scatter( std::vector& in_tensors, std::vector& out_tensors, const ScatterOptions& opts) override; - std::shared_ptr Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions& opts, - bool sync_op, - bool use_calc_stream) override; - private: std::shared_ptr CreateTask(const Place& place, int rank, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 3839f70ac13e2..9f7b3c1964e23 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -120,6 +120,72 @@ std::shared_ptr ProcessGroupStream::Broadcast( "ProcessGroup%s does not support broadcast.", GetBackendName())); } +std::shared_ptr ProcessGroupStream::Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op) { + return Reduce(out_tensor, + in_tensor, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support reduce.", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op) { + return ReduceScatter(out_tensor, + in_tensor, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support reduce_scatter.", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Scatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op) { + return Scatter(out_tensor, + in_tensor, + opts, + sync_op, + 
/*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Scatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::Unimplemented( + "ProcessGroup%s does not support scatter.", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::Recv( phi::DenseTensor* tensor, int src_rank, @@ -190,72 +256,6 @@ std::shared_ptr ProcessGroupStream::AllToAll( "ProcessGroup%s does not support do alltoall", GetBackendName())); } -std::shared_ptr ProcessGroupStream::Reduce( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceOptions& opts, - bool sync_op) { - return Reduce(in_tensors, - out_tensors, - opts, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::Reduce( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceOptions& opts, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do reduce", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::ReduceScatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceScatterOptions& opts, - bool sync_op) { - return ReduceScatter(in_tensors, - out_tensors, - opts, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::ReduceScatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ReduceScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do reduce_scatter", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions& opts, - bool sync_op) { - return Scatter(in_tensors, - out_tensors, - opts, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::Scatter( - std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions& opts, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do scatter", GetBackendName())); -} - std::shared_ptr ProcessGroupStream::Recv( std::vector& tensors, int src_rank, bool sync_op) { return Recv(tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index ad37c330681ac..d1fd95953f1f0 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -117,6 +117,43 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); + std::shared_ptr Reduce(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr ReduceScatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Scatter(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op) 
override; + + virtual std::shared_ptr Scatter( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + std::shared_ptr Recv(phi::DenseTensor* tensor, int src_rank, int64_t offset, @@ -155,45 +192,6 @@ class ProcessGroupStream : public ProcessGroup { bool sync_op, bool use_calc_stream); - std::shared_ptr Reduce( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ReduceOptions& opts, - bool sync_op) override; - - virtual std::shared_ptr Reduce( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ReduceOptions& opts, - bool sync_op, - bool use_calc_stream); - - std::shared_ptr ReduceScatter( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ReduceScatterOptions& opts, - bool sync_op) override; - - virtual std::shared_ptr ReduceScatter( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ReduceScatterOptions& opts, - bool sync_op, - bool use_calc_stream); - - std::shared_ptr Scatter( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ScatterOptions& opts, - bool sync_op) override; - - virtual std::shared_ptr Scatter( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - const ScatterOptions& opts, - bool sync_op, - bool use_calc_stream); - std::shared_ptr Recv( std::vector& tensors, // NOLINT int src_rank, diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 52160ea99a083..0634f825a0110 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -412,16 +412,17 @@ void BindDistributed(py::module *m) { .def( "reduce", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, + py::handle py_tensor, int dst, distributed::ReduceOp op, bool sync_op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto p_dense = + std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + auto in_dense = *p_dense; distributed::ReduceOptions opts{op, dst}; - auto dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector tensors = {*dense}; - return self.Reduce(tensors, tensors, opts, sync_op); + return self.Reduce(out_dense, in_dense, opts, sync_op); }, py::arg("tensor"), py::arg("dst"), @@ -432,28 +433,27 @@ void BindDistributed(py::module *m) { .def( "reduce_scatter", [](distributed::ProcessGroup &self, - py::handle py_in_tensor_list, py::handle py_out_tensor, + py::handle py_in_tensor_list, distributed::ReduceOp op, bool sync_op) { + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto p_out_tensor = std::dynamic_pointer_cast( + out_tensor.impl()); + auto out_dense = p_out_tensor.get(); + auto in_tensor_list = CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( + auto p_in_tensor = std::dynamic_pointer_cast( concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto in_dense = *p_in_tensor; distributed::ReduceScatterOptions opts{op}; - return self.ReduceScatter( - in_wrapper, out_wrapper, opts, sync_op); + return self.ReduceScatter(out_dense, in_dense, opts, 
sync_op); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("op"), py::arg("sync_op"), py::call_guard()) @@ -461,26 +461,25 @@ void BindDistributed(py::module *m) { .def( "reduce_scatter_tensor", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, distributed::ReduceOp op, bool sync_op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; distributed::ReduceScatterOptions opts{op}; - return self.ReduceScatter( - in_wrapper, out_wrapper, opts, sync_op); + return self.ReduceScatter(out_dense, in_dense, opts, sync_op); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("op"), py::arg("sync_op"), py::call_guard()) @@ -488,27 +487,27 @@ void BindDistributed(py::module *m) { .def( "scatter", [](distributed::ProcessGroup &self, - py::handle py_in_tensor_list, py::handle py_out_tensor, + py::handle py_in_tensor_list, int src, bool sync_op) { + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto p_out_tensor = std::dynamic_pointer_cast( + out_tensor.impl()); + auto *out_dense = p_out_tensor.get(); + auto in_tensor_list = CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( + auto p_in_tensor = std::dynamic_pointer_cast( concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto in_dense = *p_in_tensor; distributed::ScatterOptions opts{src}; - return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + return self.Scatter(out_dense, in_dense, opts, sync_op); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("src"), py::arg("sync_op"), py::call_guard()) @@ -516,25 +515,25 @@ void BindDistributed(py::module *m) { .def( "scatter_tensor", [](distributed::ProcessGroup &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, int src, bool sync_op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; distributed::ScatterOptions opts{src}; - return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + return self.Scatter(out_dense, in_dense, opts, sync_op); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("src"), py::arg("sync_op"), py::call_guard()) @@ -986,16 +985,17 @@ void BindDistributed(py::module *m) { 
.def( "reduce_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, + py::handle py_tensor, int dst, distributed::ReduceOp op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto p_dense = + std::dynamic_pointer_cast(tensor.impl()); + auto *out_dense = p_dense.get(); + auto in_dense = *p_dense; distributed::ReduceOptions opts{op, dst}; - auto dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector tensors = {*dense}; - return self.Reduce(tensors, - tensors, + return self.Reduce(out_dense, + in_dense, opts, /*sync_op*/ true, /*use_calc_stream*/ true); @@ -1008,116 +1008,116 @@ void BindDistributed(py::module *m) { .def( "reduce_scatter_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor_list, py::handle py_out_tensor, + py::handle py_in_tensor_list, distributed::ReduceOp op) { + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto p_out_tensor = std::dynamic_pointer_cast( + out_tensor.impl()); + auto out_dense = p_out_tensor.get(); + auto in_tensor_list = CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( + auto p_in_tensor = std::dynamic_pointer_cast( concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto in_dense = *p_in_tensor; distributed::ReduceScatterOptions opts{op}; - return self.ReduceScatter(in_wrapper, - out_wrapper, + return self.ReduceScatter(out_dense, + in_dense, opts, /*sync_op*/ true, /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("op"), py::call_guard()) .def( "reduce_scatter_tensor_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, distributed::ReduceOp op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; distributed::ReduceScatterOptions opts{op}; - return self.ReduceScatter(in_wrapper, - out_wrapper, + return self.ReduceScatter(out_dense, + in_dense, opts, /*sync_op*/ true, /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("op"), py::call_guard()) .def( "scatter_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor_list, py::handle py_out_tensor, + py::handle py_in_tensor_list, int src) { + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto p_out_tensor = std::dynamic_pointer_cast( + out_tensor.impl()); + auto *out_dense = p_out_tensor.get(); + auto in_tensor_list = CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); - auto in_dense = std::dynamic_pointer_cast( + auto p_in_tensor = std::dynamic_pointer_cast( 
concat_in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( - out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto in_dense = *p_in_tensor; distributed::ScatterOptions opts{src}; - return self.Scatter(in_wrapper, - out_wrapper, + return self.Scatter(out_dense, + in_dense, opts, /*sync_op*/ true, /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("src"), py::call_guard()) .def( "scatter_tensor_on_calc_stream", [](distributed::ProcessGroupStream &self, - py::handle py_in_tensor, py::handle py_out_tensor, + py::handle py_in_tensor, int src) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); - auto in_dense = std::dynamic_pointer_cast( - in_tensor.impl()); - std::vector in_wrapper = {*in_dense}; - auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - auto out_dense = std::dynamic_pointer_cast( + auto p_out_tensor = std::dynamic_pointer_cast( out_tensor.impl()); - std::vector out_wrapper = {*out_dense}; + auto *out_dense = p_out_tensor.get(); + + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto p_in_tensor = std::dynamic_pointer_cast( + in_tensor.impl()); + auto in_dense = *p_in_tensor; distributed::ScatterOptions opts{src}; - return self.Scatter(in_wrapper, - out_wrapper, + return self.Scatter(out_dense, + in_dense, opts, /*sync_op*/ true, /*use_calc_stream*/ true); }, - py::arg("in"), py::arg("out"), + py::arg("in"), py::arg("src"), py::call_guard()) diff --git a/python/paddle/distributed/communication/stream/reduce_scatter.py b/python/paddle/distributed/communication/stream/reduce_scatter.py index aa0d1e9b95550..4d26e8d2b66c5 100644 --- a/python/paddle/distributed/communication/stream/reduce_scatter.py +++ b/python/paddle/distributed/communication/stream/reduce_scatter.py @@ -57,11 +57,11 @@ def _reduce_scatter_tensor_in_dygraph( if use_calc_stream: return group.process_group.reduce_scatter_tensor_on_calc_stream( - in_tensor, out_tensor, op_type + out_tensor, in_tensor, op_type ) task = group.process_group.reduce_scatter_tensor( - in_tensor, out_tensor, op_type, sync_op + out_tensor, in_tensor, op_type, sync_op ) if sync_op: task.wait() @@ -78,11 +78,11 @@ def _reduce_scatter_in_dygraph( if use_calc_stream: return group.process_group.reduce_scatter_on_calc_stream( - tensor_list, tensor, op_type + tensor, tensor_list, op_type ) task = group.process_group.reduce_scatter( - tensor_list, tensor, op_type, sync_op + tensor, tensor_list, op_type, sync_op ) if sync_op: task.wait() diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py index 75a8ab3909a88..5767c2150d813 100644 --- a/python/paddle/distributed/communication/stream/scatter.py +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -53,11 +53,11 @@ def _scatter_tensor_in_dygraph( if use_calc_stream: return group.process_group.scatter_tensor_on_calc_stream( - in_tensor, out_tensor, src_rank_in_group + out_tensor, in_tensor, src_rank_in_group ) task = group.process_group.scatter_tensor( - in_tensor, out_tensor, src_rank_in_group, sync_op + out_tensor, in_tensor, src_rank_in_group, sync_op ) if sync_op: task.wait() @@ -80,11 +80,11 @@ def _scatter_in_dygraph( if use_calc_stream: return group.process_group.scatter_on_calc_stream( - tensor_list, tensor, src_rank_in_group + tensor, tensor_list, src_rank_in_group ) task = group.process_group.scatter( - 
tensor_list, tensor, src_rank_in_group, sync_op + tensor, tensor_list, src_rank_in_group, sync_op ) if sync_op: task.wait() From 27ee6e714046e8cf6dd913854da167233f7f7c41 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Fri, 18 Nov 2022 16:25:18 +0800 Subject: [PATCH 090/210] [PHI decoupling] move "gpu_device_function.h" from fluid to phi (#48097) * move "paddle/phi/backends/gpu/gpu_device_function.h" to phi * update copyright years * rm "fluid/platform/device/gpu/gpu_device_function.h" in phi * fix rocm-complie bugs --- .../backends/gpu/cuda/cuda_device_function.h | 191 ++++++++++++++++++ paddle/phi/backends/gpu/gpu_device_function.h | 24 +++ .../backends/gpu/rocm/rocm_device_function.h | 165 +++++++++++++++ .../phi/kernels/funcs/elementwise_grad_base.h | 36 ++-- paddle/phi/kernels/funcs/reduce_function.h | 2 +- .../phi/kernels/gpu/activation_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/activation_kernel.cu | 2 +- .../kernels/gpu/affine_grid_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/affine_grid_kernel.cu | 2 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 4 +- .../kernels/gpu/grid_sample_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/group_norm_utils.h | 2 +- paddle/phi/kernels/gpu/interpolate_kernel.cu | 2 +- .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 6 +- .../kernels/primitive/compute_primitives.h | 6 +- 19 files changed, 420 insertions(+), 36 deletions(-) create mode 100644 paddle/phi/backends/gpu/cuda/cuda_device_function.h create mode 100644 paddle/phi/backends/gpu/gpu_device_function.h create mode 100644 paddle/phi/backends/gpu/rocm/rocm_device_function.h diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h new file mode 100644 index 0000000000000..10aee53c45cf9 --- /dev/null +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -0,0 +1,191 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. +#define PADDLE_CUDA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define FULL_WARP_MASK 0xFFFFFFFF +#define CREATE_SHFL_MASK(mask, predicate) \ + mask = __ballot_sync(FULL_WARP_MASK, (predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) 
\ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down_sync(mask, val, static_cast(delta), width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down_sync( + mask, val.to_half(), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { +#if defined(PADDLE_CUDA_BF16) + return phi::dtype::bfloat16(__shfl_down_sync(mask, + static_cast(val), + static_cast(delta), + width)); +#else + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = static_cast(__shfl_down_sync( + mask, static_cast(val.real), static_cast(delta), width)); + float imag = static_cast(__shfl_down_sync( + mask, static_cast(val.imag), static_cast(delta), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = + static_cast(__shfl_down_sync(mask, + static_cast(val.real), + static_cast(delta), + width)); + double imag = + static_cast(__shfl_down_sync(mask, + static_cast(val.imag), + static_cast(delta), + width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( + unsigned mask, phi::dtype::float16 val, int width) { + return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { +#if defined(PADDLE_CUDA_BF16) + return phi::dtype::bfloat16( + __shfl_xor_sync(mask, static_cast(val), width)); +#else + PADDLE_ENFORCE( + false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + float imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = static_cast( + __shfl_xor_sync(mask, static_cast(val.real), width)); + double imag = static_cast( + __shfl_xor_sync(mask, static_cast(val.imag), width)); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl_sync(mask, val, src_line, 
width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. + const int warpSize = 32; + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h new file mode 100644 index 0000000000000..0f79e2a645ab3 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" +#else +#include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" +#endif + +#endif diff --git a/paddle/phi/backends/gpu/rocm/rocm_device_function.h b/paddle/phi/backends/gpu/rocm/rocm_device_function.h new file mode 100644 index 0000000000000..6f5d684075f0f --- /dev/null +++ b/paddle/phi/backends/gpu/rocm/rocm_device_function.h @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// NOTE(): support float16 to half in header file. +#define PADDLE_CUDA_FP16 +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace backends { +namespace gpu { + +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) + +#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) 
\ + case (dim): { \ + constexpr auto kPowerOfTwoDim = (dim); \ + __VA_ARGS__; \ + } break + +#define CUDA_LAUNCH_KERNEL_HELPER(...) \ + CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ + CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); + +template +__forceinline__ __device__ T +CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { + return __shfl_down(val, delta, width); +} + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, + T val, + int width = warpSize) { + return __shfl_xor(val, width); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( + unsigned mask, phi::dtype::float16 val, int delta, int width) { + return phi::dtype::float16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( + unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { + return phi::dtype::bfloat16(__shfl_down( + static_cast(val), static_cast(delta), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( + unsigned mask, phi::dtype::complex val, int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( + unsigned mask, phi::dtype::float16 val, int width) { + return phi::dtype::float16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( + unsigned mask, phi::dtype::bfloat16 val, int width) { + return phi::dtype::bfloat16(__shfl_xor(static_cast(val), width)); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template <> +__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( + unsigned mask, phi::dtype::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return phi::dtype::complex(real, imag); +} + +template +__forceinline__ __device__ T +CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { + return __shfl(val, src_line, width); +} + +template +HOSTDEVICE T Infinity() { + return INFINITY; +} + +template +__device__ T reduceSum(T val, int tid, int len) { + // NOTE(zcd): The warp size should be taken from the + // parameters of the GPU but not specified as 32 simply. + // To make the reduceSum more efficiently, + // I use Warp-Level Parallelism and assume the Warp size + // is 32 which may be different for different GPU, + // but most card's warp size is 32. 
+#ifdef PADDLE_WITH_HIP + const int warpSize = 64; +#else + const int warpSize = 32; +#endif + __shared__ T shm[warpSize]; + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, tid < len); + + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + + if (tid < warpSize) shm[tid] = 0; + __syncthreads(); + + if (tid % warpSize == 0) { + shm[tid / warpSize] = val; + } + __syncthreads(); + + CREATE_SHFL_MASK(mask, tid < warpSize); + + if (tid < warpSize) { + val = shm[tid]; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +} // namespace gpu +} // namespace backends +} // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index c55ce6a89ae1c..65f21e5b7f196 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -24,7 +24,7 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" @@ -504,7 +504,7 @@ static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, } if (dd) { int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (tid == 0) { dd[bid] = val; } @@ -527,7 +527,7 @@ static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, } if (dd) { int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (tid == 0) { dd[bid] = val; } @@ -569,7 +569,7 @@ static __global__ void FastCommonGradBroadcastAllCUDAKernel( } if (dy) { int h = n > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (tid == 0) { dy[bid] = val; } @@ -590,7 +590,7 @@ static __global__ void FastCommonGradBroadcastAllCUDAKernel( } if (dx) { int h = n > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : n; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (tid == 0) { dx[bid] = val; } @@ -636,7 +636,8 @@ static __global__ void FastCommonGradBroadcastCUDAKernelHeight(const T *x, if (dy) { T my_val = sdata[THREAD_ID_X][THREAD_ID_Y]; for (int i = warpSize >> 1; i > 0; i >>= 1) { - my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + my_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); } __syncthreads(); if ((THREAD_ID_X == 0)) { @@ -665,7 +666,8 @@ static __global__ void FastCommonGradBroadcastCUDAKernelHeight(const T *x, if (dy) { T my_val = sdata[THREAD_ID_X][THREAD_ID_Y]; for (int i = warpSize >> 1; i > 0; i >>= 1) { - my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + my_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); } __syncthreads(); if ((THREAD_ID_X == 0)) { @@ -709,7 +711,7 @@ static __global__ void CommonGradBroadcast1CUDAKernelHeight(const T *x, if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dy[j] = val; } @@ -726,7 +728,7 @@ static __global__ void CommonGradBroadcast1CUDAKernelHeight(const T *x, if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dy[j] = val; } @@ -764,7 +766,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(const T *x, if (dy) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dy[j] = val; } @@ -783,7 +785,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(const T *x, if (dx) { h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dx[j] = val; } @@ -835,7 +837,8 @@ static __global__ void FastElemwiseGradBroadcast1CUDAKernel( if (dy) { T my_val = sdata[THREAD_ID_X][THREAD_ID_Y]; for (int i = warpSize >> 1; i > 0; i >>= 1) - my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + my_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); __syncthreads(); if ((THREAD_ID_X == 0)) { sdata[0][THREAD_ID_Y] = my_val; @@ -866,7 +869,8 @@ static __global__ void FastElemwiseGradBroadcast1CUDAKernel( if (dx) { T my_val = sdata[THREAD_ID_X][THREAD_ID_Y]; for (int i = warpSize >> 1; i > 0; i >>= 1) - my_val += paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); + my_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, my_val, i); __syncthreads(); if ((THREAD_ID_X == 0)) { sdata[0][THREAD_ID_Y] = my_val; @@ -921,7 +925,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(const T *x, if (dy) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dy[j] = val; } @@ -948,7 +952,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(const T *x, if (dx) { int h = pre * post; h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (THREAD_ID_X == 0) { dx[j] = val; } @@ -1054,7 +1058,7 @@ __global__ void CommonGradBroadcastCUDAKernel(const int *x_strides_array, out_index = C_index; val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); } - val = paddle::platform::reduceSum(val, tid, thread_num); + val = phi::backends::gpu::reduceSum(val, tid, thread_num); if (THREAD_ID_X == 0) { dx[i] = val; } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 9719fbd88160e..1b1a55b25c5ec 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -33,8 +33,8 @@ namespace cub = hipcub; #endif #ifndef PADDLE_WITH_XPU_KP -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #endif diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 5e75909649a65..2c2ca16e2623f 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -14,8 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index df8ae72346a6d..5168a1de07335 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index a7a82236a40a2..886aaa76e41ec 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -16,10 +16,10 @@ #include "paddle/phi/kernels/affine_grid_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/affine_grid_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_kernel.cu index 499ed260eef47..8274e687512ac 100644 --- a/paddle/phi/kernels/gpu/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_kernel.cu @@ -16,10 +16,10 @@ #include "paddle/phi/kernels/affine_grid_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 5d40304c5e0c6..df3e4bd0cf118 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -24,8 +24,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 76201a1077edb..bee9fc801b795 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -24,8 +24,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 5da0ae96e6be4..9ed8813504150 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -27,7 +27,7 @@ 
limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -92,7 +92,7 @@ class DepthwiseConvFilterGradFunctor { template __forceinline__ __device__ T WarpReduceSum(T val, unsigned lane_mask) { for (int mask = HALF_WARP; mask > 0; mask >>= 1) - val += platform::CudaShuffleDownSync(lane_mask, val, mask); + val += phi::backends::gpu::CudaShuffleDownSync(lane_mask, val, mask); return val; } diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu index 8f4beaa26775f..6e8b12c4b1b90 100644 --- a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/grid_sample_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h index 00986817c61a0..3cb13692d52ca 100644 --- a/paddle/phi/kernels/gpu/group_norm_utils.h +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -22,7 +22,7 @@ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 625718e8f4bc9..8135e73142fec 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -14,8 +14,8 @@ #include "paddle/phi/kernels/interpolate_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu index 4bc8c205025e3..d1cc738e2b01b 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_grad_kernel.cu @@ -15,11 +15,11 @@ #ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/affine_grid_grad_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu index 98f200480d44c..6c5d305abbff2 100644 --- a/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu +++ b/paddle/phi/kernels/gpudnn/affine_grid_kernel.cu @@ -15,11 +15,11 @@ 
#ifndef PADDLE_WITH_HIP #include "paddle/phi/kernels/affine_grid_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 99cd4c9b6d8db..a81357e99b58d 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #define MATRIX_SOFTMAX_ALIGN_BYTES 16 #define MATRIX_SOFTMAX_THREAHOLD 100000 @@ -133,7 +133,7 @@ __device__ __forceinline__ void WarpReduceSum(T* sum) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { T sum_val = - paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = sum[i] + sum_val; } } @@ -146,7 +146,7 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { T max_val = - paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = max(sum[i], max_val); } } diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index b3da41976624b..1dfcde4e5dd0e 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -21,7 +21,7 @@ #include #endif -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/float16.h" namespace phi { @@ -65,7 +65,7 @@ __device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int stride = details::kWarpSize / 2; stride > 0; stride >>= 1) { - T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); + T temp = phi::backends::gpu::CudaShuffleDownSync(mask, val, stride); val = reducer(val, temp); } return val; @@ -110,7 +110,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int stride = 1; stride < block_dim_x; stride <<= 1) { - T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); + T temp = phi::backends::gpu::CudaShuffleDownSync(mask, val, stride); val = reducer(val, temp); } __syncthreads(); From b0e28540b7912869632726d6e8cc799c491c5748 Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Fri, 18 Nov 2022 16:29:27 +0800 Subject: [PATCH 091/210] Optimize FusedBiasAddGelu Kernel (#47679) * Add quick gelu and fused bias add kernel * fix annotation * remove useless code * add fast gelu option and set it in multi transformer op * add flag to restrict if use fast gelu approximate * fix flags conflict * fix use tanh function instead * add cudart version 
limit * use phi fast tanh func * fix comment --- .../operators/fused/fused_dropout_act_bias.h | 103 ++++++++++++++---- .../operators/fused/fused_dropout_helper.h | 71 ++++++++---- paddle/phi/kernels/gpu/gelu_funcs.h | 14 +-- 3 files changed, 138 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index e3e19d9ea6ebc..553fb8d7be604 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -18,13 +18,11 @@ limitations under the License. */ #endif #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" namespace paddle { namespace operators { -/** - *@brief the gelu functor - */ template struct GeluFunctor { inline __host__ __device__ T operator()(const T x) const { @@ -36,6 +34,13 @@ struct GeluFunctor { } }; +template +struct FastGeluFunctor { + inline __device__ T operator()(const T x) const { + return phi::GeluFwd(x); + } +}; + /** *@brief the gelu grad functor */ @@ -131,6 +136,49 @@ __global__ void FusedDropoutActBias( } } +template +__global__ void FusedActBias(Functor act, + const uint64_t elem_cnt, + const uint64_t cols, + const InType *__restrict__ src, + const T *__restrict__ bias, + OutType *dst) { + const int32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x; + using LoadT = phi::AlignedVector; + using LoadInType = phi::AlignedVector; + using LoadFloat = phi::AlignedVector; + using StoreOutType = phi::AlignedVector; + + LoadInType src_vec; + LoadT bias_vec; + StoreOutType out_vec; + for (int32_t idx = global_thread_idx * VecSize, + step = blockDim.x * gridDim.x * VecSize; + idx < elem_cnt; + idx += step) { + const int32_t col_idx = idx % cols; + phi::Load(&src[idx], &src_vec); + if (bias) { + phi::Load(&bias[col_idx], &bias_vec); + } +#pragma unroll + for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) { + if (bias) { + out_vec[unroll_idx] = static_cast( + act(static_cast(src_vec[unroll_idx]) + bias_vec[unroll_idx])); + } else { + out_vec[unroll_idx] = + static_cast(act(static_cast(src_vec[unroll_idx]))); + } + } + phi::Store(out_vec, &dst[idx]); + } +} + /** * @brief dst = dropout(activation(src + bias)); */ @@ -170,24 +218,37 @@ void LaunchDropoutActBias(Functor act_functor, const int real_vec_size = cols % VecSize == 0 ? 
VecSize : 1; const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedDropoutActBias - <<>>( - act_functor, - seed, - rows, - cols, - increment, - dropout_prob, - is_upscale_in_train, - is_test, - src, - bias, - dst, - mask_data, - quant_last_in_scale, - dequant_out_scale_data, - quant_out_scale_offset, - quant_next_in_scale); + if (is_test && (dequant_out_scale_data == nullptr)) { + const int32_t elem_cnt = rows * cols; + const int32_t pack_num = elem_cnt / VecSize; + const int32_t tmp_cols = cols / VecSize; + int block_size = + std::max(static_cast(32), std::min(tmp_cols, 128)); + const int grid_size = std::max(static_cast(1), + (pack_num + block_size - 1) / block_size); + FusedActBias + <<>>( + act_functor, elem_cnt, cols, src, bias, dst); + } else { + FusedDropoutActBias + <<>>( + act_functor, + seed, + rows, + cols, + increment, + dropout_prob, + is_upscale_in_train, + is_test, + src, + bias, + dst, + mask_data, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); + } } else { FusedDropoutActBias <<>>( diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 3230854284062..46c5f7c0e5f94 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" #include "paddle/phi/kernels/funcs/functors.h" +DECLARE_bool(use_fast_math); + namespace paddle { namespace operators { @@ -216,28 +218,53 @@ class FusedDropoutHelper { const float quant_min_bound = -127.0) { auto increment = GetIncrement(ctx); if (act_method == "gelu") { - GeluFunctor gelu; - LaunchDropoutActBias, InType, OutType>( - gelu, - dropout_param_.seed, - rows_, - cols_, - dropout_param_.increment, - dropout_param_.dropout_prob, - dropout_param_.is_upscale_in_train, - dropout_param_.is_test, - src, - bias, - out, - mask, - ctx, - quant_last_in_scale, - dequant_out_scale_data, - quant_out_scale_offset, - quant_next_in_scale, - quant_round_type, - quant_max_bound, - quant_min_bound); + if (FLAGS_use_fast_math) { + FastGeluFunctor fast_gelu; + LaunchDropoutActBias, InType, OutType>( + fast_gelu, + dropout_param_.seed, + rows_, + cols_, + dropout_param_.increment, + dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, + dropout_param_.is_test, + src, + bias, + out, + mask, + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + GeluFunctor gelu; + LaunchDropoutActBias, InType, OutType>( + gelu, + dropout_param_.seed, + rows_, + cols_, + dropout_param_.increment, + dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, + dropout_param_.is_test, + src, + bias, + out, + mask, + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } } else if (act_method == "relu") { phi::funcs::ReluFunctor relu; LaunchDropoutActBias -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); +template +static __device__ __forceinline__ T GeluFwd(T x) { + const float cast_x = static_cast(x); + auto tanh_out = FP32FastTanh(0.79788456f * 
cast_x * + (1.0f + 0.044715f * cast_x * cast_x)); + return static_cast(cast_x * 0.5f * (1.0f + tanh_out)); } template @@ -67,8 +68,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); + in_arr[i] = GeluFwd(in_arr[i]); } *reinterpret_cast(y + offset) = in_arr; } From 058aa3817660ddfbc25dec72184772842171b050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Fri, 18 Nov 2022 16:43:08 +0800 Subject: [PATCH 092/210] =?UTF-8?q?(fluid=E6=B8=85=E7=90=86=EF=BC=89remove?= =?UTF-8?q?=20stack=20in=20nn.py=20under=20fluid=20(#47942)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/fluid/dataloader/collate.py | 2 +- python/paddle/fluid/layers/nn.py | 142 ------------------ python/paddle/fluid/layers/rnn.py | 8 +- python/paddle/fluid/layers/tensor.py | 3 +- .../dygraph_to_static/bert_dygraph_model.py | 2 +- .../seq2seq_dygraph_model.py | 14 +- .../unittests/dygraph_to_static/test_list.py | 2 +- .../transformer_dygraph_model.py | 6 +- .../tests/unittests/ipu/test_stack_op_ipu.py | 2 +- .../tests/unittests/npu/test_stack_op_npu.py | 2 +- .../test_dynamic_rnn_stop_gradient.py | 3 +- .../fluid/tests/unittests/test_stack_op.py | 2 +- 12 files changed, 25 insertions(+), 163 deletions(-) diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index 50b86ca41e53b..661a0de13cd51 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -58,7 +58,7 @@ def default_collate_fn(batch): batch = np.stack(batch, axis=0) return batch elif isinstance(sample, (paddle.Tensor, core.eager.Tensor)): - return layers.stack(batch, axis=0) + return paddle.stack(batch, axis=0) elif isinstance(sample, numbers.Number): batch = np.array(batch) return batch diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6ff8f22a71921..494fd6d47a62c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -137,7 +137,6 @@ 'brelu', 'leaky_relu', 'flatten', - 'stack', 'pad2d', 'unique', 'unique_with_counts', @@ -10247,147 +10246,6 @@ def flatten(x, axis=1, name=None): return out -def stack(x, axis=0, name=None): - """ - - This OP stacks all the inputs :code:`x` along axis. - - .. code-block:: text - - Case 1: - - Input: - x[0].shape = [1, 2] - x[0].data = [ [1.0 , 2.0 ] ] - x[1].shape = [1, 2] - x[1].data = [ [3.0 , 4.0 ] ] - x[2].shape = [1, 2] - x[2].data = [ [5.0 , 6.0 ] ] - - Attrs: - axis = 0 - - Output: - Out.dims = [3, 1, 2] - Out.data =[ [ [1.0, 2.0] ], - [ [3.0, 4.0] ], - [ [5.0, 6.0] ] ] - - - Case 2: - - - Input: - x[0].shape = [1, 2] - x[0].data = [ [1.0 , 2.0 ] ] - x[1].shape = [1, 2] - x[1].data = [ [3.0 , 4.0 ] ] - x[2].shape = [1, 2] - x[2].data = [ [5.0 , 6.0 ] ] - - - Attrs: - axis = 1 or axis = -2 - - Output: - Out.shape = [1, 3, 2] - Out.data =[ [ [1.0, 2.0] - [3.0, 4.0] - [5.0, 6.0] ] ] - - - Args: - x (list(Variable)|tuple(Variable)): Input :code:`x` can be a :code:`list` or :code:`tuple` of Tensors, the shapes of all these Tensors - must be the same. Supposing input is N dims - Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims - Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. - Supported data types: float32, float64, int32, int64. 
- axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. - If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. - name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. - - - Returns: - Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - # set batch size=None - x1 = fluid.data(name='x1', shape=[None, 1, 2], dtype='int32') - x2 = fluid.data(name='x2', shape=[None, 1, 2], dtype='int32') - # stack Tensor list - data = layers.stack([x1,x2]) # stack according to axis 0, data.shape=[2, None, 1, 2] - - data = layers.stack([x1,x2], axis=1) # stack according to axis 1, data.shape=[None, 2, 1, 2] - - - """ - axis = 0 if axis is None else axis - - if in_dygraph_mode(): - return _C_ops.stack(x, axis) - - if _in_legacy_dygraph(): - return _legacy_C_ops.stack(x, 'axis', axis) - - if not isinstance(x, list) and not isinstance(x, tuple): - # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. - # In that case, Variable is array of tensors indeed. - if ( - isinstance(x, Variable) - and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY - ): - x = [x] - else: - raise TypeError( - "The type of '%s' in %s must be %s, but received %s" - % ( - 'x', - 'stack', - 'list[Tensor], tuple[Tensor] or TensorArray', - type(x), - ) - ) - - helper = LayerHelper('stack', **locals()) - - out = helper.create_variable_for_type_inference(x[0].dtype) - if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: - assert len(x) == 1, ( - "If the elements of 'x' in stack are Variable(LoDTensorArray), " - "number of the elements must be 1, but received %s." 
% len(x) - ) - out_index = helper.create_variable_for_type_inference(dtype="int32") - - for i in x: - check_variable_and_dtype( - i, - 'x', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'stack', - ) - - helper.append_op( - type='tensor_array_to_tensor', - inputs={'X': x[0]}, - outputs={'Out': [out], 'OutIndex': [out_index]}, - attrs={'axis': axis, 'use_stack': True}, - ) - else: - helper.append_op( - type='stack', - inputs={'X': x}, - outputs={'Y': out}, - attrs={'axis': axis}, - ) - - return out - - @templatedoc(op_type="filter_by_instag") def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0): """ diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index fc0603e227362..0104502a7ddbd 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -623,7 +623,7 @@ def _rnn_dynamic_graph( ) final_outputs = map_structure( - lambda x: nn.stack(x.array, axis=time_step_index), outputs + lambda x: paddle.stack(x.array, axis=time_step_index), outputs ) if is_reverse: @@ -1167,7 +1167,7 @@ def _gather(self, x, indices, batch_size): ), [1, self.beam_size], ) - topk_coordinates = nn.stack([batch_pos, indices], axis=2) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) topk_coordinates.stop_gradient = True return nn.gather_nd(x, topk_coordinates) @@ -1546,7 +1546,9 @@ def _maybe_copy(state, new_state, step_mask): if max_step_num is not None and step_idx > max_step_num: break - final_outputs = map_structure(lambda x: nn.stack(x.array, axis=0), outputs) + final_outputs = map_structure( + lambda x: paddle.stack(x.array, axis=0), outputs + ) final_states = states try: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 79766ba09f791..0f6652fdd5d7c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -571,8 +571,9 @@ def tensor_array_to_tensor(input, axis=1, name=None, use_stack=False): assert isinstance( input, list ), "The 'input' in tensor_array_to_tensor must be list" - from .nn import stack, concat + from .nn import concat from ..dygraph import to_variable + from paddle import stack op = stack if use_stack else concat res = op(input, axis=axis) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 76627c5a5ef0a..e5b85be96b8ba 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -279,7 +279,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask): self_attn_mask = fluid.layers.scale( x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False ) - n_head_self_attn_mask = fluid.layers.stack( + n_head_self_attn_mask = paddle.stack( x=[self_attn_mask] * self._n_head, axis=1 ) n_head_self_attn_mask.stop_gradient = True diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index d46778e838fc1..20aa0870086f4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -196,7 +196,7 @@ def _real_state(self, state, new_state, step_mask): return new_state def _gather(self, x, indices, batch_pos): - topk_coordinates = 
fluid.layers.stack([batch_pos, indices], axis=2) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) return fluid.layers.gather_nd(x, topk_coordinates) @declarative @@ -288,7 +288,7 @@ def forward(self, inputs): step_input = new_hidden dec_output.append(step_input) - dec_output = fluid.layers.stack(dec_output) + dec_output = paddle.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) loss = fluid.layers.softmax_with_cross_entropy( logits=dec_output, label=label, soft_label=False @@ -498,8 +498,8 @@ def beam_search(self, inputs): predicted_ids.append(token_indices) parent_ids.append(beam_indices) - predicted_ids = fluid.layers.stack(predicted_ids) - parent_ids = fluid.layers.stack(parent_ids) + predicted_ids = paddle.stack(predicted_ids) + parent_ids = paddle.stack(parent_ids) predicted_ids = fluid.layers.gather_tree(predicted_ids, parent_ids) predicted_ids = self._transpose_batch_time(predicted_ids) return predicted_ids @@ -680,7 +680,7 @@ def _real_state(self, state, new_state, step_mask): return new_state def _gather(self, x, indices, batch_pos): - topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) return fluid.layers.gather_nd(x, topk_coordinates) def attention(self, query, enc_output, mask=None): @@ -774,7 +774,7 @@ def forward(self, inputs): enc_outputs.append(enc_step_input) enc_hidden, enc_cell = new_enc_hidden, new_enc_cell - enc_outputs = fluid.layers.stack(enc_outputs) + enc_outputs = paddle.stack(enc_outputs) enc_outputs = self._transpose_batch_time(enc_outputs) # train @@ -815,7 +815,7 @@ def forward(self, inputs): dec_output.append(out) dec_hidden, dec_cell = new_dec_hidden, new_dec_cell - dec_output = fluid.layers.stack(dec_output) + dec_output = paddle.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) loss = fluid.layers.softmax_with_cross_entropy( logits=dec_output, label=label, soft_label=False diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 4f6984c5e05fc..f2914614e603e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -121,7 +121,7 @@ def test_list_append_in_while_loop_with_stack(x, iter_num): while i < iter_num.numpy()[0]: a.append(x) i += 1 - out = fluid.layers.stack(a, axis=1) + out = paddle.stack(a, axis=1) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 69839cf72b5c7..a2c6b4c225dcd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -762,7 +762,7 @@ def mask_probs(probs, finished, noend_mask_tensor): return probs def gather(input, indices, batch_pos): - topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) return layers.gather_nd(input, topk_coordinates) # run encoder @@ -876,8 +876,8 @@ def gather(input, indices, batch_pos): if layers.reduce_all(finished).numpy(): break - predict_ids = layers.stack(predict_ids, axis=0) - parent_ids = layers.stack(parent_ids, axis=0) + predict_ids = paddle.stack(predict_ids, axis=0) + parent_ids = paddle.stack(parent_ids, 
axis=0) finished_seq = layers.transpose( layers.gather_tree(predict_ids, parent_ids), [1, 2, 0] ) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 1f1fbf6d789ae..e2e5405ff0266 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -62,7 +62,7 @@ def build_model(self): z = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32' ) - out = paddle.fluid.layers.stack([x, y, z], **self.attrs) + out = paddle.stack([x, y, z], **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py index 8a9e8879b8226..e2509e12b2705 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_stack_op_npu.py @@ -144,7 +144,7 @@ def set_program(self): for i in range(self.iter_num): fluid.layers.array_write(input, zero + i, tensor_array) - self.out_var = fluid.layers.stack(tensor_array, axis=self.axis) + self.out_var = paddle.stack(tensor_array, axis=self.axis) def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py index 3922f6a8c229d..dd19157c32169 100644 --- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import unittest +import paddle def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): @@ -45,7 +46,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): layers.unsqueeze(layers.range(0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size], ) - topk_coordinates = layers.stack([batch_pos, indices], axis=2) + topk_coordinates = paddle.stack([batch_pos, indices], axis=2) topk_coordinates.stop_gradient = stop_gradient score = layers.gather_nd(x, topk_coordinates) layers.increment(x=step_idx, value=1.0, in_place=True) diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index 0f71ebeac5b21..15947f40f03de 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -172,7 +172,7 @@ def set_program(self): for i in range(self.iter_num): fluid.layers.array_write(input, zero + i, tensor_array) - self.out_var = fluid.layers.stack(tensor_array, axis=self.axis) + self.out_var = paddle.stack(tensor_array, axis=self.axis) def test_case(self): self.assertTrue(self.out_var.shape[self.axis] == -1) From 7f92e27efd72998102cff1ef524dc24ec7288ed2 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 18 Nov 2022 16:55:24 +0800 Subject: [PATCH 093/210] Fix bug of zero_allocator in HostAlloc (#48108) * fix bug of zero_allocator in host * fix test compile bug * add unittest * update test --- .../fluid/inference/api/analysis_predictor.cc | 4 ++ .../fluid/inference/tensorrt/test_engine.cc | 4 ++ paddle/fluid/memory/malloc_test.cu | 4 ++ paddle/fluid/platform/collective_helper.cc | 8 ++++ .../fluid/platform/device/gpu/nccl_helper.h | 4 ++ paddle/fluid/platform/device_context.cc | 3 ++ 
paddle/fluid/platform/device_context_test.cu | 48 +++++++++++++++++++ paddle/fluid/pybind/pybind.cc | 12 +++++ paddle/phi/core/device_context.cc | 29 ++++++++++- paddle/phi/core/device_context.h | 9 ++++ 10 files changed, 124 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2f2e0ff9f7259..c1ca6d8e9608c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -427,6 +427,10 @@ void AnalysisPredictor::InitDeviceContexts() { memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(place_) .get()); + gpu_context->SetHostZeroAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CPUPlace()) + .get()); gpu_context->SetGenerator( framework::DefaultCUDAGenerator(place_.GetDeviceId()).get()); gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get()); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 027c593d73c6f..9a06b2e65ef10 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -38,6 +38,10 @@ class TensorRTEngineTest : public ::testing::Test { paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(platform::CUDAPlace(0)) .get()); + ctx_->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); ctx_->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 0bf5e99b773b2..9a8ab9324f1c2 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -183,6 +183,10 @@ TEST(Malloc, GPUContextMultiThreadMultiStream) { paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(place) .get()); + ctx->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); ctx->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 67ccac77e4e03..41cb9ed1b700d 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -221,6 +221,10 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(CUDAPlace(dev_id)) .get()); + dev_ctx->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) @@ -364,6 +368,10 @@ BKCLComm* BKCLCommContext::AssignBKCLComm( paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(XPUPlace(dev_id)) .get()); + dev_ctx->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); BKCLCommImpl* c = new BKCLCommImpl; c->set_ring_id(ring_id); diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 5d89da86efa6c..f17ad3749fac5 
100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -136,6 +136,10 @@ struct NCCLContext { paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(CUDAPlace(dev_id)) .get()); + ctx_->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); ctx_->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index f0b1efc769430..cafb7e1da0f82 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -183,6 +183,9 @@ std::unique_ptr CreateDeviceContext( dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(p) .get()); + dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CPUPlace()) + .get()); return PtrType(dev_ctx); } diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index abffa1e8846df..c4b998f660f35 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/dense_tensor.h" TEST(Device, Init) { using paddle::platform::CUDAPlace; @@ -38,6 +39,10 @@ TEST(Device, Init) { paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(CUDAPlace(i)) .get()); + device_context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); device_context->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) @@ -69,6 +74,10 @@ TEST(Device, GPUContext) { paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(CUDAPlace(i)) .get()); + device_context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); device_context->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) @@ -92,6 +101,45 @@ TEST(Device, GPUContext) { } } +TEST(Device, HostZeroAllocator) { + using paddle::platform::CUDAPlace; + + auto device_context = std::make_unique(CUDAPlace(0)); + device_context->SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(CUDAPlace(0), device_context->stream()) + .get()); + device_context->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(CUDAPlace(0)) + .get()); + device_context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); + device_context->SetPinnedAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + device_context->PartialInitWithAllocator(); + + phi::DenseTensor tensor; + tensor.Resize({0}); + 
device_context->HostAlloc(&tensor); + ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU); + ASSERT_EQ(tensor.numel(), 0); + ASSERT_EQ(tensor.dtype(), phi::DataType::FLOAT32); + + phi::GPUContext gpu_context(CUDAPlace(0)); + gpu_context.SetHostZeroAllocator(&device_context->GetHostZeroAllocator()); + gpu_context.HostAlloc(&tensor); + ASSERT_EQ(tensor.place().GetType(), phi::AllocationType::CPU); +} + TEST(Device, DeviceContextPool) { using paddle::platform::CPUPlace; using paddle::platform::CUDAPlace; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b4d175efd2b56..32bfeb8b1c343 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1325,6 +1325,10 @@ All parameter, weight, gradient are variables in Paddle. paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(place) .get()); + context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); return context; }) .def_static( @@ -1349,6 +1353,10 @@ All parameter, weight, gradient are variables in Paddle. paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(place) .get()); + context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); return context; #endif }) @@ -1410,6 +1418,10 @@ All parameter, weight, gradient are variables in Paddle. paddle::memory::allocation::AllocatorFacade::Instance() .GetZeroAllocator(place) .get()); + context->SetHostZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(paddle::platform::CPUPlace()) + .get()); context->SetPinnedAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPinnedPlace()) diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index dd3a30ed2992e..d46f9250eeb4c 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -54,6 +54,14 @@ struct DeviceContext::Impl { zero_allocator_ = allocator; } + void SetHostZeroAllocator(const Allocator* allocator) { + PADDLE_ENFORCE_NOT_NULL( + allocator, + phi::errors::InvalidArgument( + "Required allocator shall not be nullptr, but received nullptr.")); + host_zero_allocator_ = allocator; + } + void SetPinnedAllocator(const Allocator* allocator) { PADDLE_ENFORCE_NOT_NULL( allocator, @@ -106,6 +114,14 @@ struct DeviceContext::Impl { return *zero_allocator_; } + const Allocator& GetHostZeroAllocator() const { + PADDLE_ENFORCE_NOT_NULL( + host_zero_allocator_, + phi::errors::InvalidArgument("Required zero_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *host_zero_allocator_; + } + const Allocator& GetPinnedAllocator() const { PADDLE_ENFORCE_NOT_NULL( pinned_allocator_, @@ -172,7 +188,8 @@ struct DeviceContext::Impl { if (tensor->initialized() && tensor->place() != CPUPlace()) { ClearHolder(tensor); } - auto* allocator = tensor->numel() == 0 ? zero_allocator_ : host_allocator_; + auto* allocator = + tensor->numel() == 0 ? 
host_zero_allocator_ : host_allocator_; return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); } @@ -234,6 +251,7 @@ struct DeviceContext::Impl { const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; + const Allocator* host_zero_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr}; #ifdef PADDLE_WITH_CUDA const Allocator* cuda_graph_allocator_{nullptr}; @@ -248,6 +266,7 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); + impl_->SetHostZeroAllocator(&other.GetHostZeroAllocator()); impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetGenerator(other.GetGenerator()); @@ -300,10 +319,18 @@ void DeviceContext::SetZeroAllocator(const Allocator* allocator) { impl_->SetZeroAllocator(allocator); } +void DeviceContext::SetHostZeroAllocator(const Allocator* allocator) { + impl_->SetHostZeroAllocator(allocator); +} + const Allocator& DeviceContext::GetZeroAllocator() const { return impl_->GetZeroAllocator(); } +const Allocator& DeviceContext::GetHostZeroAllocator() const { + return impl_->GetHostZeroAllocator(); +} + void DeviceContext::SetPinnedAllocator(const Allocator* allocator) { impl_->SetPinnedAllocator(allocator); } diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 5dad261f43b34..9114490d1a70e 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -82,6 +82,13 @@ class PADDLE_API DeviceContext { */ void SetZeroAllocator(const Allocator*); + /** + * @brief Set the zero-size host Allocator object. + * + * @param allocator + */ + void SetHostZeroAllocator(const Allocator*); + /** * @brief Set the zero-size Allocator object. * @@ -105,6 +112,8 @@ class PADDLE_API DeviceContext { const Allocator& GetZeroAllocator() const; + const Allocator& GetHostZeroAllocator() const; + const Allocator& GetPinnedAllocator() const; #ifdef PADDLE_WITH_CUDA From ec778272d3eef215c8bfa15c673cf177f0cdb645 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Fri, 18 Nov 2022 17:05:51 +0800 Subject: [PATCH 094/210] delete logical_xor api (#48070) --- python/paddle/fluid/layers/nn.py | 44 -------------------------------- 1 file changed, 44 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 494fd6d47a62c..089597fdaf1aa 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -159,7 +159,6 @@ 'size', 'logical_and', 'logical_or', - 'logical_xor', 'logical_not', 'clip', 'clip_by_norm', @@ -12340,49 +12339,6 @@ def logical_or(x, y, out=None, name=None): ) -def logical_xor(x, y, out=None, name=None): - r""" - - ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``out`` is N-dim boolean ``Tensor``. - Each element of ``out`` is calculated by - - .. math:: - - out = (x || y) \&\& !(x \&\& y) - - .. note:: - ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - - Args: - x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. 
- y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. - out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor`` will be created to save the output. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - N-D Tensor. A location into which the result is stored. It's dimension equals with ``x``. - - Examples: - .. code-block:: python - - import paddle - import numpy as np - - x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) - res = paddle.logical_xor(x, y) - print(res) # [[False, True], [ True, False]] - """ - if in_dygraph_mode(): - return _C_ops.logical_xor(x, y) - - return _logical_op( - op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True - ) - - @templatedoc() def logical_not(x, out=None, name=None): """ From 9aacb31bb7468f23b2c86462ebd4ae63eac328b7 Mon Sep 17 00:00:00 2001 From: Zuza Gawrysiak Date: Fri, 18 Nov 2022 10:21:35 +0100 Subject: [PATCH 095/210] [PHI] Migrate conv_transpose kernel (#48119) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate conv_transpose to phi * Move handler to kernel * kernel m * Fix formatting * handler * remove fluid * revert tcp_store * tcp_store * remove unused * Fix declaration * add dnn input * Fix typo Co-authored-by: Sławomir Siwek --- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 7 + .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 2 +- .../mkldnn/conv_transpose_mkldnn_op.cc | 430 ----------------- paddle/fluid/operators/ops_extra_info.h | 1 + .../kernels/onednn/conv_transpose_kernel.cc | 440 ++++++++++++++++++ 5 files changed, 449 insertions(+), 431 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc create mode 100644 paddle/phi/kernels/onednn/conv_transpose_kernel.cc diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 24bbcbca01b4f..bf88b82fc30a1 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -315,6 +316,12 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(found_conv_bias_count); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_conv_bias_count > 0) { + string::PrettyLogDetail("--- fused %d %s with elementwise_add as bias", + found_conv_bias_count, + type()); + } } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index bdb2bef362be4..1762f638e7de8 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -34,7 +34,7 @@ PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT); 
USE_OP_ITSELF(batch_norm); PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN); USE_OP_ITSELF(conv2d_transpose); -USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); +PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(gelu); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc deleted file mode 100644 index 63fe71bce7c35..0000000000000 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using Tensor = phi::DenseTensor; -using phi::DataLayout; -using phi::funcs::OneDNNMemDesc; - -inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, - const int groups) { - auto weights_tz = phi::vectorize(filter->dims()); - int g = std::max(groups, 1); - int g_dim = (g > 1) ? 1 : 0; - phi::funcs::GetGroupConvWeightsTz(weights_tz, g); - // gIOHW -> gOIHW || IOHW -> OIHW - std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); - return weights_tz; -} - -template -class ConvTransposeMKLDNNHandlerT - : public phi::funcs::OneDNNHandlerNoCachingT { - public: - ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx, - const dnnl::engine mkldnn_engine, - const phi::DenseTensor* input, - const phi::DenseTensor* filter, - const phi::DenseTensor* bias, - phi::DenseTensor* output) - : phi::funcs::OneDNNHandlerNoCachingT( - mkldnn_engine, ctx.GetPlace()), - is_test_(ctx.Attr("is_test")) { - PADDLE_ENFORCE_EQ(is_test_, - true, - platform::errors::InvalidArgument( - "ConvTransposeMKLDNN works only for inference. " - "The attribute \'is_test\' value should be set to " - "True, but got is_test=False.")); - - PADDLE_ENFORCE_EQ( - input->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "Got wrong layout = %d for Input tensor.", input->layout())); - - PADDLE_ENFORCE_EQ( - filter->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::ONEDNN, - filter->layout())); - - PADDLE_ENFORCE_EQ( - input->dims().size(), - 4, - platform::errors::InvalidArgument("Input must be with 4 dimensions, " - "i.e. NCHW. but got dimension =%d", - input->dims().size())); - PADDLE_ENFORCE_EQ( - filter->dims().size(), - 4, - platform::errors::InvalidArgument("Filter must be with 4 dimensions, " - "i.e. 
OIHW, but got dimension =%d", - filter->dims().size())); - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), - DataLayout::ONEDNN, - platform::errors::InvalidArgument( - "The bias tensor's laytout should be %d, but got %d.", - DataLayout::ONEDNN, - bias->layout())); - - PADDLE_ENFORCE_EQ( - bias->dims().size(), - 1, - platform::errors::InvalidArgument("Bias must only have 1 dimension, " - "i.e. X, but got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - dnnl::memory::dims strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - dnnl::memory::dims paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - dnnl::memory::dims dilations(begin(dilations_temp), end(dilations_temp)); - - int groups = ctx.Attr("groups"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - PADDLE_ENFORCE_EQ( - strides.size(), - 2, - platform::errors::Unimplemented( - "Now we only support 2d oneDNN convolution transpose op")); - - const auto& input_dims = input->dims(); - const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size()); - const auto& filter_dims = filter->dims(); - const auto filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - const auto ksize = phi::vectorize(filter_data_dims); - - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - - std::transform( - dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { - return i - 1; - }); - - const auto src_tz = phi::vectorize(input->dims()); - const auto weights_tz = GetWeightsTz(filter, groups); - const auto dst_tz = phi::vectorize(output->dims()); - const auto mkldnn_paddings = phi::funcs::ToOneDNNPadding(paddings); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - const auto chosen_memory_format = OneDNNMemoryFormat::any; - - auto data_type = dnnl::memory::data_type::f32; - if (ctx.Attr("mkldnn_data_type") == "bfloat16" || - std::is_same::value) - data_type = dnnl::memory::data_type::bf16; - - const auto src_md = OneDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = - OneDNNMemDesc(weights_tz, data_type, chosen_memory_format); - const auto dst_md = OneDNNMemDesc( - dst_tz, phi::funcs::OneDNNGetDataType(), chosen_memory_format); - - const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx); - auto fwd_prop_kind = is_test_ ? 
dnnl::prop_kind::forward_inference - : dnnl::prop_kind::forward_training; - if (bias) { - std::vector bias_tz = phi::vectorize(bias->dims()); - const auto bias_md = - OneDNNMemDesc(bias_tz, data_type, OneDNNMemoryFormat::x); - this->AcquireForwardPrimitiveDescriptor( - conv_trans_attr, - fwd_prop_kind, - dnnl::algorithm::deconvolution_direct, - src_md, - weights_md, - bias_md, - dst_md, - strides, - dilations, - mkldnn_paddings[0], - mkldnn_paddings[1]); - } else { - this->AcquireForwardPrimitiveDescriptor( - conv_trans_attr, - fwd_prop_kind, - dnnl::algorithm::deconvolution_direct, - src_md, - weights_md, - dst_md, - strides, - dilations, - mkldnn_paddings[0], - mkldnn_paddings[1]); - } - } - - dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) { - dnnl::primitive_attr conv_attr; - dnnl::post_ops post_operations; - - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); - - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. - if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_relu, fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_swish, fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr AcquireSrcMemoryWithReorder( - const phi::DenseTensor* input) { - const T* input_data = input->data(); - return phi::funcs::OneDNNHandlerNoCachingT:: - AcquireMemoryWithReorder(input->mem_desc(), - this->fwd_pd_->src_desc(), - phi::funcs::to_void_cast(input_data)); - } - - std::shared_ptr AcquireWeightsMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const std::string& key, - const phi::DenseTensor* filter, - const int& groups) { - const K* filter_data = filter->data(); - auto weights_tz = GetWeightsTz(filter, groups); - int g = std::max(groups, 1); - - auto user_src_md = OneDNNMemDesc( - weights_tz, - phi::funcs::OneDNNGetDataType(), - (g == 1) ? 
OneDNNMemoryFormat::iohw : OneDNNMemoryFormat::giohw); - - return this->template AcquireMemoryWithReorder( - dev_ctx, - user_src_md, - this->fwd_pd_->weights_desc(), - phi::funcs::to_void_cast(filter_data), - key, - "@weights_mem_p", - is_test_); - } - - template - std::shared_ptr AcquireMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const dnnl::memory::desc& user_md, - const dnnl::memory::desc& target_md, - void* ptr, - const std::string& key, - const std::string& suffix, - bool is_persistent = false, - const std::vector& scale_data = {1.0f}, - int mask = 0) { - const auto target_key = key + suffix + "_target"; - const auto key_reorder_p = key + suffix + "reorder_p"; - const auto user_key = key + suffix + "_user"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(target_key)); - - if (target_memory_p == nullptr) { - auto user_memory_p = - std::make_shared(user_md, this->engine_, ptr); - if (user_md != target_md) { - target_memory_p = - std::make_shared(target_md, this->engine_); - dnnl::reorder::primitive_desc reorder_pdesc; - if (phi::funcs::is_int8()) { - dnnl::primitive_attr attr; - attr.set_output_scales(mask, scale_data); - reorder_pdesc = dnnl::reorder::primitive_desc( - *user_memory_p, *target_memory_p, attr); - } else { - reorder_pdesc = - dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); - } - auto reorder_p = std::make_shared(reorder_pdesc); - dev_ctx.SetBlob(key_reorder_p, reorder_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 1, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } else { - target_memory_p = user_memory_p; - } - dev_ctx.SetBlob(user_key, user_memory_p); - dev_ctx.SetBlob(target_key, target_memory_p); - } else if (!is_persistent) { - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - auto user_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(user_key)); - user_memory_p->set_data_handle(ptr); - - // TODO(jczaja): Here we detect if reorder is cached it means it is needed - // need to change this to get rid of keys - auto reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 1, - platform::EventRole::kUniqueOp); - reorder_p->execute( - astream, - {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - std::shared_ptr AcquireBiasMemoryWithReorder( - const platform::MKLDNNDeviceContext& dev_ctx, - const std::string& key, - const phi::DenseTensor* bias) { - const K* bias_data = bias->data(); - auto user_bias_md = OneDNNMemDesc(phi::vectorize(bias->dims()), - phi::funcs::OneDNNGetDataType(), - OneDNNMemoryFormat::x); - return this->AcquireMemoryWithReorder( - dev_ctx, - user_bias_md, - this->fwd_pd_->bias_desc(), - phi::funcs::to_void_cast(bias_data), - key, - "@bias_mem_p", - is_test_); - } - - private: - const bool is_test_; -}; - -template -class ConvTransposeMKLDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet( - "Operator DNNL ConvTranspose must use 
CPUPlace")); - const bool is_bfloat16 = - ctx.Attr("mkldnn_data_type") == "bfloat16"; - const bool force_fp32_output = ctx.Attr("force_fp32_output"); - if (is_bfloat16) { - if (force_fp32_output) - Execute(ctx); - else - Execute(ctx); - } else { - Execute(ctx); - } - } - - template - void Execute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const auto* input = ctx.Input("Input"); - const auto* filter = ctx.Input("Filter"); - const auto* bias = - ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - auto* output = ctx.Output("Output"); - ConvTransposeMKLDNNHandlerT handler( - ctx, mkldnn_engine, input, filter, bias, output); - auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - // Caching Key for weights is needed - std::string key = platform::CreateKey(dev_ctx, - ctx.InputName("Input"), - ctx.InputName("Filter"), - (bias ? ctx.InputName("Bias") : "")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( - dev_ctx, key, filter, ctx.Attr("groups")); - - std::shared_ptr dst_memory_p = - handler.template AcquireDstMemory(output); - auto conv_p = handler.AcquireForwardPrimitive(); - - std::unordered_map args = { - {DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_WEIGHTS, *weights_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - if (bias) { - auto bias_memory_p = - handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias); - args.insert({DNNL_ARG_BIAS, *bias_memory_p}); - } - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - conv_p->execute(astream, args); - astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL( - conv2d_transpose, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConvTransposeMKLDNNOpKernel, - ops::ConvTransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index d0847df43e230..6aa6bd21fba1f 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -222,6 +222,7 @@ class ExtraInfoUtils { // TODO(chenweihang): move these extra inputs into op_compat.yaml std::unordered_map> g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}}, + {"conv2d_transpose", {"Bias"}}, {"conv2d_grad", {"Bias"}}}; std::vector empty_extra_input_names_; }; diff --git a/paddle/phi/kernels/onednn/conv_transpose_kernel.cc b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc new file mode 100644 index 0000000000000..fd47d5ce540f4 --- /dev/null +++ b/paddle/phi/kernels/onednn/conv_transpose_kernel.cc @@ -0,0 +1,440 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/conv_transpose_kernel.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/onednn/onednn_helper.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/expect.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/data_layout_transform.h" + +namespace phi { + +inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter, + const int groups) { + auto weights_tz = phi::vectorize(filter->dims()); + int g = std::max(groups, 1); + int g_dim = (g > 1) ? 1 : 0; + funcs::GetGroupConvWeightsTz(weights_tz, g); + // gIOHW -> gOIHW || IOHW -> OIHW + std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]); + return weights_tz; +} + +template +class ConvTransposeOneDNNHandlerT + : public funcs::OneDNNHandlerNoCachingT { + private: + const bool is_test_; + + public: + ConvTransposeOneDNNHandlerT(const OneDNNContext& dev_ctx, + const DenseTensor* x, + const DenseTensor* filter, + const DenseTensor* bias, + const std::vector& strides_in, + const std::vector& paddings_in, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_in, + DenseTensor* out) + : funcs::OneDNNHandlerNoCachingT( + dev_ctx.GetEngine(), dev_ctx.GetPlace()), + is_test_(dev_ctx.HasDnnAttr("is_test") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test")) + : false) { + PADDLE_ENFORCE_EQ(is_test_, + true, + phi::errors::InvalidArgument( + "ConvTransposeOneDNN works only for inference. " + "The attribute \'is_test\' value should be set to " + "True, but got is_test=False.")); + + PADDLE_ENFORCE_EQ( + x->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument("Got wrong layout = %d for Input tensor.", + x->layout())); + + PADDLE_ENFORCE_EQ( + filter->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The filter tensor's layout should be %d, but got %d.", + DataLayout::ONEDNN, + filter->layout())); + + PADDLE_ENFORCE_EQ( + x->dims().size(), + 4, + phi::errors::InvalidArgument("Input must be with 4 dimensions, " + "i.e. NCHW. but got dimension =%d", + x->dims().size())); + PADDLE_ENFORCE_EQ( + filter->dims().size(), + 4, + phi::errors::InvalidArgument("Filter must be with 4 dimensions, " + "i.e. OIHW, but got dimension =%d", + filter->dims().size())); + + if (bias) { + PADDLE_ENFORCE_EQ( + bias->layout(), + DataLayout::ONEDNN, + phi::errors::InvalidArgument( + "The bias tensor's laytout should be %d, but got %d.", + DataLayout::ONEDNN, + bias->layout())); + + PADDLE_ENFORCE_EQ( + bias->dims().size(), + 1, + phi::errors::InvalidArgument("Bias must only have 1 dimension, " + "i.e. 
X, but got dimension = %d .", + bias->dims().size())); + } + + dnnl::memory::dims strides(begin(strides_in), end(strides_in)); + dnnl::memory::dims paddings(begin(paddings_in), end(paddings_in)); + dnnl::memory::dims dilations(begin(dilations_in), end(dilations_in)); + + PADDLE_ENFORCE_EQ( + strides.size(), + 2, + phi::errors::Unimplemented( + "Now we only support 2d oneDNN convolution transpose op")); + + const auto x_dims = x->dims(); + const auto x_data_dims = phi::slice_ddim(x_dims, 2, x_dims.size()); + const auto filter_dims = filter->dims(); + const auto filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + const auto ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, x_data_dims, strides, ksize); + + std::transform( + dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { + return i - 1; + }); + + const auto src_tz = phi::vectorize(x->dims()); + const auto weights_tz = GetWeightsTz(filter, groups); + const auto dst_tz = phi::vectorize(out->dims()); + const auto onednn_paddings = funcs::ToOneDNNPadding(paddings); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto chosen_memory_format = funcs::OneDNNMemoryFormat::any; + auto data_type = dnnl::memory::data_type::f32; + const bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + if (is_BFLOAT16 || std::is_same::value) { + data_type = dnnl::memory::data_type::bf16; + } + + const auto src_md = + funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format); + const auto weights_md = + funcs::OneDNNMemDesc(weights_tz, data_type, chosen_memory_format); + const auto dst_md = funcs::OneDNNMemDesc( + dst_tz, funcs::OneDNNGetDataType(), chosen_memory_format); + + auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + + if (bias) { + std::vector bias_tz = phi::vectorize(bias->dims()); + const auto bias_md = funcs::OneDNNMemDesc( + bias_tz, data_type, funcs::OneDNNMemoryFormat::x); + this->AcquireForwardPrimitiveDescriptor( + fwd_prop_kind, + dnnl::algorithm::deconvolution_direct, + src_md, + weights_md, + bias_md, + dst_md, + strides, + dilations, + onednn_paddings[0], + onednn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + fwd_prop_kind, + dnnl::algorithm::deconvolution_direct, + src_md, + weights_md, + dst_md, + strides, + dilations, + onednn_paddings[0], + onednn_paddings[1]); + } + } + + std::shared_ptr AcquireSrcMemoryWithReorder( + const phi::DenseTensor* x) { + const T* input_data = x->data(); + return funcs::OneDNNHandlerNoCachingT:: + AcquireMemoryWithReorder(x->mem_desc(), + this->fwd_pd_->src_desc(), + funcs::to_void_cast(input_data)); + } + + std::shared_ptr AcquireWeightsMemoryWithReorder( + const OneDNNContext& dev_ctx, + const std::string& key, + const phi::DenseTensor* filter, + const int& groups) { + const K* filter_data = filter->data(); + auto weights_tz = GetWeightsTz(filter, groups); + int g = std::max(groups, 1); + + auto user_src_md = + funcs::OneDNNMemDesc(weights_tz, + funcs::OneDNNGetDataType(), + (g == 1) ? 
funcs::OneDNNMemoryFormat::iohw + : funcs::OneDNNMemoryFormat::giohw); + + return this->template AcquireMemoryWithReorder( + dev_ctx, + user_src_md, + this->fwd_pd_->weights_desc(), + funcs::to_void_cast(filter_data), + key, + "@weights_mem_p", + is_test_); + } + + template + std::shared_ptr AcquireMemoryWithReorder( + const OneDNNContext& dev_ctx, + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + const std::string& key, + const std::string& suffix, + bool is_persistent = false, + const std::vector& scale_data = {1.0f}, + int mask = 0) { + const auto target_key = key + suffix + "_target"; + const auto key_reorder_p = key + suffix + "reorder_p"; + const auto user_key = key + suffix + "_user"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(target_key)); + + if (target_memory_p == nullptr) { + auto user_memory_p = + std::make_shared(user_md, this->engine_, ptr); + if (user_md != target_md) { + target_memory_p = + std::make_shared(target_md, this->engine_); + dnnl::reorder::primitive_desc reorder_pdesc; + if (funcs::is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc( + *user_memory_p, *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); + dev_ctx.SetBlob(key_reorder_p, reorder_p); + + auto& astream = OneDNNContext::tls().get_stream(); + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 1, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } else { + target_memory_p = user_memory_p; + } + dev_ctx.SetBlob(user_key, user_memory_p); + dev_ctx.SetBlob(target_key, target_memory_p); + } else if (!is_persistent) { + auto& astream = OneDNNContext::tls().get_stream(); + + auto user_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(user_key)); + user_memory_p->set_data_handle(ptr); + + // TODO(jczaja): Here we detect if reorder is cached it means it is needed + // need to change this to get rid of keys + auto reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 1, + paddle::platform::EventRole::kUniqueOp); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); + } + } + return target_memory_p; + } + + std::shared_ptr AcquireBiasMemoryWithReorder( + const OneDNNContext& dev_ctx, + const std::string& key, + const phi::DenseTensor* bias) { + const K* bias_data = bias->data(); + auto user_bias_md = funcs::OneDNNMemDesc(phi::vectorize(bias->dims()), + funcs::OneDNNGetDataType(), + funcs::OneDNNMemoryFormat::x); + return this->AcquireMemoryWithReorder(dev_ctx, + user_bias_md, + this->fwd_pd_->bias_desc(), + funcs::to_void_cast(bias_data), + key, + "@bias_mem_p", + is_test_); + } +}; + +template +void Execute(const OneDNNContext& dev_ctx, + const DenseTensor* x, + const DenseTensor* filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + DenseTensor* out) { + const auto* bias = + dev_ctx.HasDnnInput("Bias") ? 
dev_ctx.GetDnnInput("Bias") : nullptr; + + ConvTransposeOneDNNHandlerT handler(dev_ctx, + x, + filter, + bias, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(x); + // Caching Key for weights is needed + std::string key = + funcs::CreateKey(dev_ctx, + dev_ctx.GetInputsName("Input")[0], + dev_ctx.GetInputsName("Filter")[0], + (bias ? dev_ctx.GetInputsName("Bias")[0] : "")); + key = funcs::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + auto weights_memory_p = + handler.AcquireWeightsMemoryWithReorder(dev_ctx, key, filter, groups); + + std::shared_ptr dst_memory_p = + handler.template AcquireDstMemory(out); + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + auto bias_memory_p = + handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias); + args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + auto& astream = OneDNNContext::tls().get_stream(); + conv_p->execute(astream, args); + astream.wait(); + out->set_mem_desc(dst_memory_p->get_desc()); +} + +template +void Conv2dTransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* out) { + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(), + AllocationType::CPU, + phi::errors::PreconditionNotMet( + "Operator oneDNN Conv must use CPUPlace")); + + const bool is_BFLOAT16 = + dev_ctx.HasDnnAttr("mkldnn_data_type") + ? PADDLE_GET_CONST(std::string, + dev_ctx.GetDnnAttr("mkldnn_data_type")) == + "bfloat16" + : false; + const bool force_fp32_output = + dev_ctx.HasDnnAttr("force_fp32_output") + ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) + : false; + const bool use_bfloat16 = (!force_fp32_output && is_BFLOAT16); + + if (use_bfloat16) { + Execute(dev_ctx, + &x, + &filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + } else { + Execute(dev_ctx, + &x, + &filter, + strides, + paddings, + padding_algorithm, + groups, + dilations, + out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(conv2d_transpose, + OneDNN, + ONEDNN, + phi::Conv2dTransposeKernel, + float, + phi::dtype::bfloat16) {} From 7073ed5ba3a4b62d541f77fc5db6419c9c776f2f Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Fri, 18 Nov 2022 17:30:42 +0800 Subject: [PATCH 096/210] Remove API: pad_constant_like (#47949) remove pad_constant_like which is not used in paddle 2.0 --- python/paddle/fluid/layers/nn.py | 97 ------------------- .../tests/unittests/test_pad_constant_like.py | 39 +------- 2 files changed, 1 insertion(+), 135 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 089597fdaf1aa..50602825c1d28 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -107,7 +107,6 @@ 'lod_append', 'lrn', 'pad', - 'pad_constant_like', 'label_smooth', 'roi_pool', 'roi_align', @@ -7022,102 +7021,6 @@ def pad(x, paddings, pad_value=0.0, name=None): return out -def pad_constant_like(x, y, pad_value=0.0, name=None): - r""" - Pad :attr:`y` with :attr:`pad_value`, the number of values padded to - the edges of each axis is specified by the difference of the shape - of :attr:`x` and :attr:`y` . ((0, shape_x_0 - shape_y_0), ... (0, shape_x_n - shape_y_n)) - specify padding widths for each axis. The input should be a k-D tensor(k > 0 and k < 7). - - See below for an example. - - .. code-block:: text - - Given: - X = [[[[ 0, 1, 2], - [ 3, 4, 5]], - [[ 6, 7, 8], - [ 9, 10, 11]], - [[12, 13, 14], - [15, 16, 17]]], - [[[18, 19, 20], - [21, 22, 23]], - [[24, 25, 26], - [27, 28, 29]], - [[30, 31, 32], - [33, 34, 35]]]] - - X.shape = (2, 3, 2, 3) - - Y = [[[[35, 36, 37]], - [[38, 39, 40]], - [[41, 42, 43]]]] - - Y.shape = (1, 3, 1, 3) - - And - pad_value = 0. - - Return: - Out = [[[[35, 36, 37], - [ 0, 0, 0]], - [[38, 39, 40], - [ 0, 0, 0]], - [[41, 42, 43], - [ 0, 0, 0]]], - [[[ 0, 0, 0], - [ 0, 0, 0]], - [[ 0, 0, 0], - [ 0, 0, 0]], - [[ 0, 0, 0], - [ 0, 0, 0]]]] - - Out.shape = [2, 3, 2, 3] - - - Args: - x (Variable): Tensor, its shape specifies the shape of output. - y (Variable): Tensor, its rank is the same with :attr:`x`, and for each dimension :math:`i` , - :math:`y\_shape[i] <= x\_shape[i]` . The data type can be float32 or float64. - pad_value (float): The constant value used to pad. - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - Returns: - The padded tensor, with the same shape as :attr:`x` and the same data type as :attr:`y` - - Return Type: - Variable - - Examples: - .. code-block:: python - - # x is a rank 4 tensor variable, x.shape = (2, 3, 2, 3) - # y is a rank 4 tensor variable, y.shape = (1, 3, 1, 3) - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[2,3,2,3], dtype='float32') - y = fluid.data(name='y', shape=[1,3,1,3], dtype='float32') - out = fluid.layers.pad_constant_like(x=x, y=y, pad_value=0.) 
- # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] - """ - check_type(x, 'x', (Variable), 'pad_constant_like') - check_variable_and_dtype( - y, 'y', ['float32', 'float64', 'int32', 'int64'], "pad_constant_like" - ) - - helper = LayerHelper('pad_constant_like', **locals()) - dtype = helper.input_dtype(input_param_name='y') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='pad_constant_like', - inputs={'X': x, 'Y': y}, - outputs={'Out': out}, - attrs={'pad_value': float(pad_value)}, - ) - return out - - def label_smooth( label, prior_dist=None, epsilon=0.1, dtype="float32", name=None ): diff --git a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py index fda844eb09b5d..ee957096b5b19 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_constant_like.py +++ b/python/paddle/fluid/tests/unittests/test_pad_constant_like.py @@ -14,9 +14,7 @@ import unittest import numpy as np -from op_test import OpTest, check_out_dtype -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard +from op_test import OpTest class TestPadConstantLikeOp(OpTest): @@ -67,40 +65,5 @@ def initTestCase(self): self.pad_value = 0.5 -class TestPadConstantLikeOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - x_data = np.random.random((2, 2, 2, 2)).astype("float32") - y_data = np.random.random((2, 2, 2, 2)).astype("float32") - - def test_Variable_x(): - var_y = fluid.data( - name="data_y", shape=[2, 2, 2, 2], dtype="float32" - ) - fluid.layers.pad_constant_like(x=x_data, y=var_y) - - self.assertRaises(TypeError, test_Variable_x) - - def test_Variable_y(): - var_x = fluid.data( - name="data_x", shape=[2, 2, 2, 2], dtype="float32" - ) - fluid.layers.pad_constant_like(x=var_x, y=y_data) - - self.assertRaises(TypeError, test_Variable_y) - - -class TestOutDtype(unittest.TestCase): - def test_dtype(self): - api_fn = fluid.layers.pad_constant_like - check_out_dtype( - api_fn, - in_specs=[([2, 3, 2, 3], 'float64'), ([1, 3, 1, 3],)], - expect_dtypes=['float32', 'float64', 'int32', 'int64'], - target_index=1, - pad_value=0.0, - ) - - if __name__ == '__main__': unittest.main() From 4ab18ada50c17f8fe8b8defb02784684cc4537d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Fri, 18 Nov 2022 11:15:28 +0100 Subject: [PATCH 097/210] [PHI] Migrate matmul_grad kernel (#48023) * cleanup unused code * unify is_int8 is_bfloat16 * Simplify matmul_v2 FWD kernel * remove RunKernel methods * remove import namespace * remove headers * clean fluid/phi cross imports * remove fluid axpy_handler * delete fluid methods * activations * OneDNNMemDesc * MKLDNNFormatForSize * MatchShapeToLayout * MKLDNNMemoryFormat * MKLDNNFormat * ReorderMKLDNNHandler * to_void_cast * review suggestions * interpolate * remove fluid depedency * init * ExecuteMatMulV2 * rm fluid kernel * matmul_grad * remove mutable_data --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 222 +---------- paddle/phi/backends/onednn/onednn_reuse.h | 352 +++++++++++++++++- .../phi/kernels/onednn/matmul_grad_kernel.cc | 163 ++++++++ 3 files changed, 514 insertions(+), 223 deletions(-) create mode 100644 paddle/phi/kernels/onednn/matmul_grad_kernel.cc diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 68813fbb5482e..810c0eaff1861 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -75,20 +75,6 @@ static Tensor FoldFirstAndLastDims(const MKLDNNDeviceContext &dev_ctx, return output; } -// Get row matrix shape from a vector shape. If the rank of x_dim > 1, the -// original x_dim is returned. -static paddle::framework::DDim RowMatrixDimsFromVector( - const paddle::framework::DDim &x_dim) { - return x_dim.size() > 1 ? x_dim : phi::make_ddim({1, x_dim[0]}); -} - -// Get column matrix shape from a vector shape. If the ran of y_dim > 1, the -// original y_dim is returned. -static paddle::framework::DDim ColumnMatrixDimsFromVector( - const paddle::framework::DDim &y_dim) { - return y_dim.size() > 1 ? y_dim : phi::make_ddim({y_dim[0], 1}); -} - phi::DDim GetDimForInput(const ExecutionContext &ctx, std::string input_name) { auto shape = ctx.Attr>("fused_reshape_" + input_name); auto axis = ctx.Attr>("fused_transpose_" + input_name); @@ -245,8 +231,8 @@ static void ReshapeTensorToMatrixSequence( */ static void ReshapeXYOutToMatrixSequence( Tensor *x, Tensor *y, Tensor *out, bool trans_x, bool trans_y) { - auto x_dim = RowMatrixDimsFromVector(x->dims()); - auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); + auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { @@ -304,8 +290,9 @@ std::vector GetInputStrides(const ExecutionContext &ctx, new_dims = input_dims.reshape(shape).transpose(axis); } - auto &MatrixDimsFromVector = - input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; + auto &MatrixDimsFromVector = input_name == "X" + ? phi::funcs::RowMatrixDimsFromVector + : phi::funcs::ColumnMatrixDimsFromVector; phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0, @@ -707,199 +694,6 @@ class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { } }; -template -class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext &ctx) const override { - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - - auto x_dims = vectorize(x->dims()); - auto y_dims = vectorize(y->dims()); - - bool is_broadcast = true; - if (x_dims.size() <= 2 || y_dims.size() <= 2) { - is_broadcast = false; - } else if (x_dims.size() != y_dims.size()) { - is_broadcast = true; - } else { - is_broadcast = !std::equal(x_dims.cbegin(), - x_dims.cbegin() + x_dims.size() - 2, - y_dims.cbegin()); - } - - // if no broadcasting is needed, we can simply use matmul's grad and avoid - // using reduce_sum - if (!is_broadcast) { - matmul_v1_grad_mkldnn_kernel.Compute(ctx); - return; - } - - auto *dout = ctx.Input(GradVarName("Out")); - auto *dx = ctx.Output(GradVarName("X")); - auto *dy = ctx.Output(GradVarName("Y")); - - bool trans_x = ctx.HasAttr("trans_x") ? ctx.Attr("trans_x") - : ctx.Attr("transpose_X"); - bool trans_y = ctx.HasAttr("trans_y") ? 
ctx.Attr("trans_y") - : ctx.Attr("transpose_Y"); - auto dout_dims = vectorize(dout->dims()); - - size_t ndims = std::max(x->dims().size(), y->dims().size()); - ndims = std::max(ndims, 3); - - if (x_dims.size() != ndims) { - x_dims = ExtendDimsWithOnes(x_dims, ndims); - } else if (y_dims.size() != ndims) { - y_dims = ExtendDimsWithOnes(y_dims, ndims); - } - - // in broadcasting scenario new memory is required because - // reduce sum must be calculated upon broadcasted dims - Tensor dx_tmp, dy_tmp; - - std::vector dx_bd_dims(x_dims); - std::vector dy_bd_dims(y_dims); - - CalculateGradMatrixDims( - ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims); - - if (trans_x && trans_y) { - ExecuteMatMulV2( - ctx, onednn_engine, y, y_dims, true, dout, dout_dims, true, &dx_tmp); - ExecuteMatMulV2( - ctx, onednn_engine, dout, dout_dims, true, x, x_dims, true, &dy_tmp); - } else if (trans_x) { - ExecuteMatMulV2( - ctx, onednn_engine, y, y_dims, false, dout, dout_dims, true, &dx_tmp); - ExecuteMatMulV2(ctx, - onednn_engine, - x, - x_dims, - false, - dout, - dout_dims, - false, - &dy_tmp); - } else if (trans_y) { - ExecuteMatMulV2(ctx, - onednn_engine, - dout, - dout_dims, - false, - y, - y_dims, - false, - &dx_tmp); - ExecuteMatMulV2( - ctx, onednn_engine, dout, dout_dims, true, x, x_dims, false, &dy_tmp); - } else { - ExecuteMatMulV2( - ctx, onednn_engine, dout, dout_dims, false, y, y_dims, true, &dx_tmp); - ExecuteMatMulV2( - ctx, onednn_engine, x, x_dims, true, dout, dout_dims, false, &dy_tmp); - } - - if (x_dims != dx_bd_dims) { - ReduceSumForMatmulGradOutput(ctx, - dev_ctx, - onednn_engine, - &dx_tmp, - dx, - x_dims, - vectorize(x->dims())); - } else { - *dx = std::move(dx_tmp); - } - if (y_dims != dy_bd_dims) { - ReduceSumForMatmulGradOutput(ctx, - dev_ctx, - onednn_engine, - &dy_tmp, - dy, - y_dims, - vectorize(y->dims())); - } else { - *dy = std::move(dy_tmp); - } - - dx->Resize(x->dims()); - dy->Resize(y->dims()); - } - - private: - void CalculateGradMatrixDims(const ExecutionContext &ctx, - Tensor *dx_tmp, - Tensor *dy_tmp, - const std::vector &dx_dims, - const std::vector &dy_dims, - std::vector *dx_bd_dims, - std::vector *dy_bd_dims) const { - for (size_t i = 0; i < dx_dims.size() - 2; ++i) { - if (dx_dims[i] != dy_dims[i]) { - if (dx_dims[i] == 1) { - (*dx_bd_dims)[i] = dy_dims[i]; - } else { - (*dy_bd_dims)[i] = dx_dims[i]; - } - } - } - - dx_tmp->Resize(phi::make_ddim((*dx_bd_dims))); - dx_tmp->mutable_data(ctx.GetPlace()); - dy_tmp->Resize(phi::make_ddim((*dy_bd_dims))); - dy_tmp->mutable_data(ctx.GetPlace()); - } - - void ReduceSumForMatmulGradOutput( - const ExecutionContext &ctx, - const MKLDNNDeviceContext &dev_ctx, - const dnnl::engine onednn_engine, - const Tensor *dx_tmp, - Tensor *dx, - const std::vector &dx_dims, - const std::vector &squeezed_dims) const { - phi::funcs::ReductionOneDNNHandler handler( - dnnl::algorithm::reduction_sum, - 0.0f, - 0.0f, - onednn_engine, - ctx.GetPlace(), - dx_tmp, - dx, - dx_dims); - - auto src_memory_p = handler.AcquireSrcMemory(dx_tmp); - auto dst_memory_p = handler.AcquireDstMemory(dx); - - std::unordered_map reduction_args = { - {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; - - auto &astream = MKLDNNDeviceContext::tls().get_stream(); - auto reduction_p = handler.AcquireForwardPrimitive(); - - reduction_p->execute(astream, reduction_args); - astream.wait(); - - dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); - } - - std::vector ExtendDimsWithOnes(const std::vector &dims, - int new_size) const { - 
std::vector new_dims(new_size, 1); - for (size_t i = 0; i < dims.size(); ++i) { - new_dims[new_size - dims.size() + i] = dims[i]; - } - - return new_dims; - } - - private: - MatMulGradMKLDNNKernel matmul_v1_grad_mkldnn_kernel; -}; } // anonymous namespace REGISTER_OP_KERNEL(matmul, @@ -923,9 +717,3 @@ REGISTER_OP_KERNEL(matmul_v2, MatMulV2MKLDNNKernel, MatMulV2MKLDNNKernel, MatMulV2MKLDNNKernel); - -REGISTER_OP_KERNEL(matmul_v2_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - MatMulV2GradMKLDNNKernel, - MatMulV2GradMKLDNNKernel); diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 7395138bfd63b..bd3d3f30f7a44 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/data_layout_transform.h" #include "paddle/phi/kernels/funcs/pooling.h" @@ -1331,14 +1332,13 @@ class BatchNormOneDNNHandler diff_scaleshift_data); } - std::shared_ptr AcquireMeanMemory( - const phi::DenseTensor* mean) { + std::shared_ptr AcquireMeanMemory(const DenseTensor* mean) { const T* mean_data = mean->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), to_void_cast(mean_data)); } - std::shared_ptr AcquireMeanMemory(phi::DenseTensor* mean) { + std::shared_ptr AcquireMeanMemory(DenseTensor* mean) { T* mean_data = mean->mutable_data(this->place_, this->fwd_pd_->mean_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), @@ -1346,14 +1346,13 @@ class BatchNormOneDNNHandler } std::shared_ptr AcquireVarianceMemory( - const phi::DenseTensor* variance) { + const DenseTensor* variance) { const T* variance_data = variance->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), to_void_cast(variance_data)); } - std::shared_ptr AcquireVarianceMemory( - phi::DenseTensor* variance) { + std::shared_ptr AcquireVarianceMemory(DenseTensor* variance) { T* variance_data = variance->mutable_data( this->place_, this->fwd_pd_->variance_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), @@ -1630,5 +1629,346 @@ class PoolingOneDNNHandler } }; +static DDim RowMatrixDimsFromVector(const DDim& x_dim) { + return x_dim.size() > 1 ? x_dim : make_ddim({1, x_dim[0]}); +} + +static DDim ColumnMatrixDimsFromVector(const DDim& y_dim) { + return y_dim.size() > 1 ? y_dim : make_ddim({y_dim[0], 1}); +} + +static std::vector TransposeAxis(const std::vector& x, + const std::vector& axis) { + size_t in_rank = x.size(); + size_t axis_size = axis.size(); + + auto axis_set = std::set(axis.begin(), axis.end()); + PADDLE_ENFORCE_EQ(axis_set.size(), + axis_size, + paddle::platform::errors::InvalidArgument( + "In an axis array, elements must be unique.")); + + PADDLE_ENFORCE_EQ(in_rank, + axis_size, + paddle::platform::errors::InvalidArgument( + "The input dimension's size " + "should be equal to the axis's size. 
" + "But received dimension is %d, " + "axis's size is %d", + in_rank, + axis_size)); + + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), + axis_size, + paddle::platform::errors::InvalidArgument( + "Axis values must be ranging from 0 to (dims - 1).")); + + std::vector new_x(x.size()); + for (size_t i = 0; i < x.size(); i++) { + new_x[i] = x[axis[i]]; + } + return new_x; +} + +static std::vector GetInputStrides(const OneDNNContext& dev_ctx, + const DDim& input_dims, + const std::string input_name, + const bool transpose_input) { + auto new_dims = input_dims; + auto shape = + dev_ctx.HasDnnAttr("fused_reshape_" + input_name) + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("fused_reshape_" + input_name)) + : std::vector(); + auto axis = dev_ctx.HasDnnAttr("fused_transpose_" + input_name) + ? PADDLE_GET_CONST( + std::vector, + dev_ctx.GetDnnAttr("fused_transpose_" + input_name)) + : std::vector(); + + if (!shape.empty() && !axis.empty()) { + new_dims = input_dims.reshape(shape).transpose(axis); + } + + auto& MatrixDimsFromVector = + input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; + phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor( + MatrixDimsFromVector(new_dims), 0, transpose_input); + + std::vector strides; + if (!shape.empty()) { + auto shape2 = input_dims.reshape(shape); + strides.push_back(1); + for (auto i = shape2.size() - 1; i > 0; --i) { + strides.insert(strides.begin(), + strides.front() * static_cast(shape2[i])); + } + strides = TransposeAxis(strides, axis); + if (shape.size() == 2) + strides.insert(strides.begin(), + static_cast(shape[0] * shape[1])); + mat_dim.stride_ = strides[0]; + if (mat_dim.trans_) std::swap(*strides.rbegin(), *(++strides.rbegin())); + } + return strides; +} + +static bool IsOutputFused(const OneDNNContext& dev_ctx) { + const auto shape = + dev_ctx.HasDnnAttr("fused_reshape_Out") + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("fused_reshape_Out")) + : std::vector(); + const auto axis = + dev_ctx.HasDnnAttr("fused_transpose_Out") + ? 
PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("fused_transpose_Out")) + : std::vector(); + return !shape.empty() && !axis.empty(); +} + +template +class MatmulOneDNNHandler + : public phi::funcs::OneDNNHandlerNoCachingT { + public: + MatmulOneDNNHandler(const OneDNNContext& dev_ctx, + const std::vector& x_org_dims, + const std::vector& y_org_dims, + bool trans_x, + bool trans_y, + const std::vector& x_strides_override, + const std::vector& y_strides_override, + bool is_output_fused) + : phi::funcs::OneDNNHandlerNoCachingT( + dev_ctx.GetEngine(), dev_ctx.GetPlace()) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + // TODO(jczaja): Why not for int8?? + if (!is_int8() && is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, OneDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, OneDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, OneDNNGetDataType(), out_strides); + + const auto matmul_attrs = CreateMatmulAttrs(dev_ctx); + + this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); + } + + float ComputeOutputScale(const OneDNNContext& dev_ctx) { + float alpha = dev_ctx.HasDnnAttr("alpha") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("alpha")) + : 1.0f; + + if (dev_ctx.HasDnnAttr("Scale_x") && dev_ctx.HasDnnAttr("Scale_y") && + dev_ctx.HasDnnAttr("Scale_out")) { + float scale_x = PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")); + float scale_y = PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")); + bool force_fp32_out = + dev_ctx.HasDnnAttr("force_fp32_output") + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output")) + : false; + float scale_out = + force_fp32_out + ? 
1.f + : PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")); + alpha *= scale_out / (scale_x * scale_y); + } + return alpha; + } + + dnnl::primitive_attr CreateMatmulAttrs(const OneDNNContext& dev_ctx) { + dnnl::primitive_attr matmul_attrs; + dnnl::post_ops post_operations; + + float scale_out = ComputeOutputScale(dev_ctx); + if (scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {scale_out}); + } + + if (dev_ctx.HasDnnInput("ResidualData")) { + auto* residual_data = dev_ctx.GetDnnInput("ResidualData"); + auto residual_data_tz = vectorize(residual_data->dims()); + auto residual_data_md = memory::desc(residual_data_tz, + OneDNNGetDataType(), + dnnl::memory::format_tag::any); + post_operations.append_binary(dnnl::algorithm::binary_add, + residual_data_md); + if (dev_ctx.HasDnnAttr("Scale_in_eltwise")) { + float scale_in_eltwise = + PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_in_eltwise")); + float sum_scale = scale_out / scale_in_eltwise; + post_operations.append_sum(sum_scale); + } + } + + AppendActivation(dev_ctx, post_operations); + + if (dev_ctx.HasDnnAttr("fused_output_scale")) { + float scale_alpha = + PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } + + matmul_attrs.set_post_ops(post_operations); + return matmul_attrs; + } + + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const DenseTensor* input) { + const YT* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data)); + } + + std::shared_ptr AcquireDstMemory(const OneDNNContext& dev_ctx, + DenseTensor* output) { + // We cannot use base AcquireDstMemory as it makes an allocation request + // base on DST memory primitive size. This is fine in general, but in MatMul + // we have primitive that covers only one batch of Data and then shift + // pointer for every new batch. Hence DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. 
So as there is no 'any' format here we can + // leave default size of DenseTensor as computed in ComputeInferShape + OT* ptr = dev_ctx.template Alloc(output); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); + } +}; + +template +void ExecuteMatmul(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const std::vector& x_dims, + const std::vector& y_dims, + bool trans_x, + bool trans_y, + DenseTensor* out) { + auto x_strides_override = GetInputStrides(dev_ctx, x.dims(), "X", trans_x); + auto y_strides_override = GetInputStrides(dev_ctx, y.dims(), "Y", trans_y); + MatmulOneDNNHandler handler(dev_ctx, + x_dims, + y_dims, + trans_x, + trans_y, + x_strides_override, + y_strides_override, + IsOutputFused(dev_ctx)); + + const auto src_memory_p = handler.AcquireSrcMemory(&x); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y); + const auto dst_memory_p = handler.AcquireDstMemory(dev_ctx, out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (dev_ctx.HasDnnInput("ResidualData")) { + auto* residual_data = dev_ctx.GetDnnInput("ResidualData"); + const auto residual_data_memory_p = handler.AcquireSrcMemory(residual_data); + matmul_args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *residual_data_memory_p}); + } + + auto& astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need + // permute + if (IsOutputFused(dev_ctx) && !is_int8()) { + const auto axis = + dev_ctx.HasDnnAttr("fused_transpose_Out") + ? PADDLE_GET_CONST(std::vector, + dev_ctx.GetDnnAttr("fused_transpose_Out")) + : std::vector(); + auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); + out->set_mem_desc(permuted_md.reshape(vectorize(out->dims()))); + } else { + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc new file mode 100644 index 0000000000000..47807f156b18f --- /dev/null +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/matmul_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +std::vector ExtendDimsWithOnes(const std::vector &dims, + int new_size) { + std::vector new_dims(new_size, 1); + for (size_t i = 0; i < dims.size(); ++i) { + new_dims[new_size - dims.size() + i] = dims[i]; + } + + return new_dims; +} + +template +void CalculateGradMatrixDims(const OneDNNContext &dev_ctx, + DenseTensor *dx_tmp, + DenseTensor *dy_tmp, + const std::vector &dx_dims, + const std::vector &dy_dims, + std::vector *dx_bd_dims, + std::vector *dy_bd_dims) { + for (size_t i = 0; i < dx_dims.size() - 2; ++i) { + if (dx_dims[i] != dy_dims[i]) { + if (dx_dims[i] == 1) { + (*dx_bd_dims)[i] = dy_dims[i]; + } else { + (*dy_bd_dims)[i] = dx_dims[i]; + } + } + } + + dx_tmp->Resize(make_ddim((*dx_bd_dims))); + dev_ctx.template Alloc(dx_tmp); + dy_tmp->Resize(make_ddim((*dy_bd_dims))); + dev_ctx.template Alloc(dy_tmp); +} + +template +void ReduceSumForMatmulGradOutput(const OneDNNContext &dev_ctx, + const DenseTensor *dx_tmp, + DenseTensor *dx, + const std::vector &dx_dims, + const std::vector &squeezed_dims) { + funcs::ReductionOneDNNHandler handler(dnnl::algorithm::reduction_sum, + 0.0f, + 0.0f, + dev_ctx.GetEngine(), + dev_ctx.GetPlace(), + dx_tmp, + dx, + dx_dims); + + auto src_memory_p = handler.AcquireSrcMemory(dx_tmp); + auto dst_memory_p = handler.AcquireDstMemory(dx); + + std::unordered_map reduction_args = { + {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = OneDNNContext::tls().get_stream(); + auto reduction_p = handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, reduction_args); + astream.wait(); + + dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); +} + +template +void MatmulGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &dout, + bool transpose_x, + bool transpose_y, + DenseTensor *dx, + DenseTensor *dy) { + auto x_dims = vectorize(x.dims()); + auto y_dims = vectorize(y.dims()); + auto dout_dims = vectorize(dout.dims()); + + size_t ndims = std::max(x_dims.size(), y_dims.size()); + ndims = std::max(ndims, 3); + + if (x_dims.size() != ndims) { + x_dims = ExtendDimsWithOnes(x_dims, ndims); + } else if (y_dims.size() != ndims) { + y_dims = ExtendDimsWithOnes(y_dims, ndims); + } + + // in broadcasting scenario new memory is required because + // reduce sum must be calculated upon broadcasted dims + DenseTensor dx_tmp, dy_tmp; + std::vector dx_bd_dims(x_dims); + std::vector dy_bd_dims(y_dims); + + CalculateGradMatrixDims( + dev_ctx, &dx_tmp, &dy_tmp, x_dims, y_dims, &dx_bd_dims, &dy_bd_dims); + + if (transpose_x && transpose_y) { + funcs::ExecuteMatmul( + dev_ctx, y, dout, y_dims, dout_dims, true, true, &dx_tmp); + funcs::ExecuteMatmul( + dev_ctx, dout, x, dout_dims, x_dims, true, true, &dy_tmp); + } else if (transpose_x) { + funcs::ExecuteMatmul( + dev_ctx, y, dout, y_dims, dout_dims, false, true, &dx_tmp); + funcs::ExecuteMatmul( + dev_ctx, x, dout, x_dims, dout_dims, false, false, &dy_tmp); + } else if (transpose_y) { + funcs::ExecuteMatmul( + dev_ctx, dout, y, dout_dims, y_dims, false, false, &dx_tmp); + funcs::ExecuteMatmul( + dev_ctx, dout, x, dout_dims, x_dims, true, false, &dy_tmp); + } else { + funcs::ExecuteMatmul( + dev_ctx, dout, y, dout_dims, y_dims, false, true, &dx_tmp); + funcs::ExecuteMatmul( + dev_ctx, x, dout, x_dims, dout_dims, true, false, &dy_tmp); + } + + if (x_dims != 
dx_bd_dims) { + ReduceSumForMatmulGradOutput( + dev_ctx, &dx_tmp, dx, x_dims, vectorize(x.dims())); + } else { + *dx = std::move(dx_tmp); + } + if (y_dims != dy_bd_dims) { + ReduceSumForMatmulGradOutput( + dev_ctx, &dy_tmp, dy, y_dims, vectorize(y.dims())); + } else { + *dy = std::move(dy_tmp); + } + + dx->Resize(x.dims()); + dy->Resize(y.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matmul_grad, + OneDNN, + ONEDNN, + phi::MatmulGradKernel, + float, + phi::dtype::bfloat16) {} From e54088357577b5bc9e3d19ca9eab4b7927f33ff4 Mon Sep 17 00:00:00 2001 From: yuehuayingxueluo <867460659@qq.com> Date: Fri, 18 Nov 2022 19:03:10 +0800 Subject: [PATCH 098/210] clear fluid apis: fix apis in fleet and passes (#48021) * clear fluid apis in fleet and passes * fix model.py * fix model.py * fix cpp_pass.py --- python/paddle/distributed/fleet/fleet.py | 20 +++--- .../distributed/fleet/fleet_executor_utils.py | 2 +- python/paddle/distributed/fleet/launch.py | 40 ++++++------ .../paddle/distributed/fleet/launch_utils.py | 64 +++++++++---------- python/paddle/distributed/fleet/model.py | 0 python/paddle/distributed/fleet/optimizer.py | 2 +- python/paddle/distributed/fleet/scaler.py | 2 +- python/paddle/distributed/passes/cpp_pass.py | 2 +- .../distributed/passes/fuse_all_reduce.py | 2 +- python/paddle/distributed/passes/pass_base.py | 2 +- python/paddle/framework/__init__.py | 3 + 11 files changed, 71 insertions(+), 68 deletions(-) mode change 100644 => 100755 python/paddle/distributed/fleet/fleet.py mode change 100644 => 100755 python/paddle/distributed/fleet/fleet_executor_utils.py mode change 100644 => 100755 python/paddle/distributed/fleet/model.py mode change 100644 => 100755 python/paddle/distributed/fleet/optimizer.py mode change 100644 => 100755 python/paddle/distributed/fleet/scaler.py mode change 100644 => 100755 python/paddle/distributed/passes/cpp_pass.py mode change 100644 => 100755 python/paddle/distributed/passes/fuse_all_reduce.py mode change 100644 => 100755 python/paddle/distributed/passes/pass_base.py mode change 100644 => 100755 python/paddle/framework/__init__.py diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py old mode 100644 new mode 100755 index 617eb5729aef6..6b265c4902c7f --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -15,7 +15,7 @@ import copy import paddle import os -from paddle.fluid.framework import _global_flags +from paddle.framework import _global_flags from paddle.fluid import compiler from .base.role_maker import PaddleCloudRoleMaker, RoleMakerBase from .base.strategy_compiler import StrategyCompiler @@ -271,14 +271,14 @@ def init( self.strategy_compiler = StrategyCompiler() if self._role_maker._is_non_distributed() and self._is_collective: - if paddle.fluid.core.is_compiled_with_cuda(): - gpus_num = paddle.fluid.core.get_cuda_device_count() + if paddle.framework.core.is_compiled_with_cuda(): + gpus_num = paddle.framework.core.get_cuda_device_count() if gpus_num != 1: raise ValueError( "CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program." 
) - if paddle.fluid.framework._non_static_mode(): + if paddle.framework._non_static_mode(): if self.worker_num() == 1: # if worker_num is 1, should construct default topology & hcg self._topology = tp.CommunicateTopology() @@ -1011,8 +1011,8 @@ def save_dense_params( import paddle.distributed.fleet as fleet fleet.init() import paddle - place = paddle.fluid.CPUPlace() - exe = paddle.fluid.Executor(place) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) # build net # fleet.distributed_optimizer(...) @@ -1242,7 +1242,7 @@ def minimize( ) else: if ( - paddle.fluid.framework._non_static_mode() + paddle.framework._non_static_mode() or self._role_maker._is_non_distributed() or self._is_collective ): @@ -1258,7 +1258,7 @@ def _minimize_impl( context["user_defined_strategy"] = copy.deepcopy( self._user_defined_strategy ) - if paddle.fluid.framework._non_static_mode(): + if paddle.framework._non_static_mode(): # imitate target optimizer retrieval target_opt = self.user_defined_optimizer self._context = context @@ -1418,7 +1418,7 @@ def _minimize_impl( logger.debug("default program id: " + str(id(default_program))) if id(default_program) != id(loss.block.program): - paddle.fluid.framework.switch_main_program(loss.block.program) + paddle.framework.switch_main_program(loss.block.program) logger.debug( "default program id after switch: " + str(id(default_program)) ) @@ -1532,7 +1532,7 @@ def _minimize_losses_impl( # default_program = paddle.static.default_main_program() # if id(default_program) != id(losses[0].block.program): - # paddle.fluid.framework.switch_main_program(losses[0].block.program) + # paddle.framework.switch_main_program(losses[0].block.program) context["program_optimize_ops"] = optimize_ops context["program_params_grads"] = params_grads diff --git a/python/paddle/distributed/fleet/fleet_executor_utils.py b/python/paddle/distributed/fleet/fleet_executor_utils.py old mode 100644 new mode 100755 index 4eb4108d5d595..f92ab945894d9 --- a/python/paddle/distributed/fleet/fleet_executor_utils.py +++ b/python/paddle/distributed/fleet/fleet_executor_utils.py @@ -13,7 +13,7 @@ # limitations under the License. 
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY -from paddle.fluid import core +from paddle.framework import core from paddle.static import Program diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index ecf6436b94fd5..e7837032cebe5 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -62,7 +62,7 @@ import copy import pathlib from argparse import ArgumentParser, REMAINDER -import paddle.fluid as fluid +import paddle.framework as framework from paddle.distributed.fleet import launch_utils from paddle.distributed.fleet.launch_utils import ( get_host_name_ip, @@ -136,7 +136,7 @@ def _parse_args(): help="run mode of job, can be:collective/ps/ps-heter", ) - if fluid.core.is_compiled_with_cuda(): + if framework.core.is_compiled_with_cuda(): base_group.add_argument( "--gpus", type=str, @@ -147,7 +147,7 @@ def _parse_args(): ) base_group.add_argument("--selected_gpus", dest="gpus") - if fluid.core.is_compiled_with_xpu(): + if framework.core.is_compiled_with_xpu(): base_group.add_argument( "--xpus", type=str, @@ -157,7 +157,7 @@ def _parse_args(): ) base_group.add_argument("--selected_xpus", dest="xpus") - if fluid.core.is_compiled_with_npu(): + if framework.core.is_compiled_with_npu(): base_group.add_argument( "--npus", type=str, @@ -167,7 +167,7 @@ def _parse_args(): ) base_group.add_argument("--selected_npus", dest="npus") - if fluid.core.is_compiled_with_mlu(): + if framework.core.is_compiled_with_mlu(): base_group.add_argument( "--mlus", type=str, @@ -505,13 +505,13 @@ def launch_ps(args, distribute_mode): def infer_backend(args): if args.backend != "auto": return - if fluid.core.is_compiled_with_cuda(): + if framework.core.is_compiled_with_cuda(): args.backend = 'nccl' - elif fluid.core.is_compiled_with_npu(): + elif framework.core.is_compiled_with_npu(): args.backend = 'unknown' - elif fluid.core.is_compiled_with_xpu(): + elif framework.core.is_compiled_with_xpu(): args.backend = 'bkcl' - elif fluid.core.is_compiled_with_mlu(): + elif framework.core.is_compiled_with_mlu(): args.backend = 'cncl' else: args.backend = 'gloo' @@ -559,14 +559,14 @@ def which_distributed_mode(args): "Only one mode(Collective or Parameter-Server) can be selected at the same time, but more than one configuration was received." 
) - if fluid.core.is_compiled_with_cuda(): - accelerators = fluid.core.get_cuda_device_count() - elif fluid.core.is_compiled_with_npu(): - accelerators = fluid.core.get_npu_device_count() - elif fluid.core.is_compiled_with_xpu(): - accelerators = fluid.core.get_xpu_device_count() - elif fluid.core.is_compiled_with_mlu(): - accelerators = fluid.core.get_mlu_device_count() + if framework.core.is_compiled_with_cuda(): + accelerators = framework.core.get_cuda_device_count() + elif framework.core.is_compiled_with_npu(): + accelerators = framework.core.get_npu_device_count() + elif framework.core.is_compiled_with_xpu(): + accelerators = framework.core.get_xpu_device_count() + elif framework.core.is_compiled_with_mlu(): + accelerators = framework.core.get_mlu_device_count() else: accelerators = 0 @@ -591,9 +591,9 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: if ( - not fluid.core.is_compiled_with_cuda() - and not fluid.core.is_compiled_with_xpu() - and not fluid.core.is_compiled_with_mlu() + not framework.core.is_compiled_with_cuda() + and not framework.core.is_compiled_with_xpu() + and not framework.core.is_compiled_with_mlu() ): if args.servers: logger.warning( diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index b676eee5bfb18..69220924a38f8 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -27,7 +27,7 @@ import struct import json -import paddle.fluid as fluid +import paddle.framework as framework from distutils.util import strtobool import paddle.utils.cpp_extension.extension_utils as utils @@ -572,7 +572,7 @@ def start_local_trainers( [str(g) for g in t.accelerators] ) # to do: same code style in future - if fluid.core.is_compiled_with_xpu() and len(t.accelerators) > 0: + if framework.core.is_compiled_with_xpu() and len(t.accelerators) > 0: proc_env["FLAGS_selected_xpus"] = "%s" % ",".join( [str(g) for g in t.accelerators] ) @@ -706,7 +706,7 @@ def watch_local_trainers(procs, nranks): def get_gpus(gpus): if gpus is None: - gpus_num = fluid.core.get_cuda_device_count() + gpus_num = framework.core.get_cuda_device_count() res_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") @@ -740,7 +740,7 @@ def get_gpus(gpus): def get_xpus(xpus): if xpus is None: - xpus_num = fluid.core.get_xpu_device_count() + xpus_num = framework.core.get_xpu_device_count() res_xpus = [str(x) for x in range(0, xpus_num)] else: xpu_visible_devices = os.getenv("XPU_VISIBLE_DEVICES") @@ -775,7 +775,7 @@ def get_xpus(xpus): def get_npus(npus): if npus is None: - npus_num = fluid.core.get_npu_device_count() + npus_num = framework.core.get_npu_device_count() res_npus = [str(x) for x in range(0, npus_num)] else: npu_visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") @@ -809,7 +809,7 @@ def get_npus(npus): def get_mlus(mlus): if mlus is None: - mlus_num = fluid.core.get_mlu_device_count() + mlus_num = framework.core.get_mlu_device_count() res_mlus = [str(x) for x in range(0, mlus_num)] else: mlu_visible_devices = os.getenv("MLU_VISIBLE_DEVICES") @@ -845,37 +845,37 @@ def get_mlus(mlus): def get_device_mode(backend): if backend == 'heter': if ( - fluid.core.is_compiled_with_cuda() - and fluid.core.get_cuda_device_count() > 0 + framework.core.is_compiled_with_cuda() + and framework.core.get_cuda_device_count() > 0 ): print("launch train in heter mode with GPU device.") return DeviceMode.GPU if ( - 
fluid.core.is_compiled_with_xpu() - and fluid.core.get_xpu_device_count() > 0 + framework.core.is_compiled_with_xpu() + and framework.core.get_xpu_device_count() > 0 ): print("launch train in heter mode with XPU device.") return DeviceMode.XPU if ( - fluid.core.is_compiled_with_npu() - and fluid.core.get_npu_device_count() > 0 + framework.core.is_compiled_with_npu() + and framework.core.get_npu_device_count() > 0 ): print("launch train in heter mode with NPU device.") return DeviceMode.ASCEND_NPU - if backend == 'hccl' and fluid.core.get_npu_device_count() > 0: + if backend == 'hccl' and framework.core.get_npu_device_count() > 0: print("launch train in ascend npu mode!") return DeviceMode.ASCEND_NPU - if backend == 'nccl' and fluid.core.get_cuda_device_count() > 0: + if backend == 'nccl' and framework.core.get_cuda_device_count() > 0: print("launch train in GPU mode!") return DeviceMode.GPU - if backend == 'bkcl' and fluid.core.get_xpu_device_count() > 0: + if backend == 'bkcl' and framework.core.get_xpu_device_count() > 0: print("launch train in XPU mode") return DeviceMode.XPU - if backend == 'cncl' and fluid.core.get_mlu_device_count() > 0: + if backend == 'cncl' and framework.core.get_mlu_device_count() > 0: print("launch train in MLU mode") return DeviceMode.MLU @@ -1063,7 +1063,7 @@ def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode): assert ( device_mode == DeviceMode.GPU ), "Only support get mapped cluster for gpu now." - gpus_num = fluid.core.get_cuda_device_count() + gpus_num = framework.core.get_cuda_device_count() # parse ip-ranks json file cluster_topo = None @@ -1192,7 +1192,7 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode): assert ( device_mode == DeviceMode.GPU ), "Only support get mapped cluster for gpu now." - gpus_num = fluid.core.get_cuda_device_count() + gpus_num = framework.core.get_cuda_device_count() # parse ip-ranks json file rank_mapping_path = args.rank_mapping_path or os.getenv( @@ -1866,11 +1866,11 @@ def start_pod_worker(self, args, pod): heter_device_num = 0 device_list = [] - if fluid.core.is_compiled_with_cuda(): + if framework.core.is_compiled_with_cuda(): device_list = get_gpus(args.gpus) heter_device_num = len(device_list) - elif fluid.core.is_compiled_with_xpu(): - heter_device_num = fluid.core.get_xpu_device_count() + elif framework.core.is_compiled_with_xpu(): + heter_device_num = framework.core.get_xpu_device_count() device_list = [str(x) for x in range(0, heter_device_num)] for idx, cur_worker in enumerate(pod.workers): @@ -2042,11 +2042,11 @@ def start_pod_heter_worker(self, args, pod): heter_device_num = 0 device_list = [] - if fluid.core.is_compiled_with_cuda(): + if framework.core.is_compiled_with_cuda(): device_list = get_gpus(args.gpus) heter_device_num = len(device_list) - elif fluid.core.is_compiled_with_xpu(): - heter_device_num = fluid.core.get_xpu_device_count() + elif framework.core.is_compiled_with_xpu(): + heter_device_num = framework.core.get_xpu_device_count() device_list = [str(x) for x in range(0, heter_device_num)] for idx, cur_heter_worker in enumerate(pod.heter_workers): @@ -2144,25 +2144,25 @@ def check_backend(backend): "but got %s" % backend ) - if backend == 'nccl' and not fluid.core.is_compiled_with_cuda(): + if backend == 'nccl' and not framework.core.is_compiled_with_cuda(): raise ValueError( "paddle.distributed initialize error, " "your paddle is not compiled with cuda but you assign 'nccl' as backend." 
) - if backend == 'bkcl' and not fluid.core.is_compiled_with_xpu(): + if backend == 'bkcl' and not framework.core.is_compiled_with_xpu(): raise ValueError( "paddle.distributed initialize error, " "your paddle is not compiled with xpu but you assign 'bkcl' as backend." ) - if backend == 'hccl' and not fluid.core.is_compiled_with_npu(): + if backend == 'hccl' and not framework.core.is_compiled_with_npu(): raise ValueError( "paddle.distributed initialize error, " "your paddle is not compiled with npu but you assign 'hccl' as backend." ) - if backend == 'cncl' and not fluid.core.is_compiled_with_mlu(): + if backend == 'cncl' and not framework.core.is_compiled_with_mlu(): raise ValueError( "paddle.distributed initialize error, " "your paddle is not compiled with mlu but you assign 'cncl' as backend." @@ -2183,16 +2183,16 @@ def block_windows_and_macos(backend): def get_backend_by_compile_flag(): - if fluid.core.is_compiled_with_cuda(): + if framework.core.is_compiled_with_cuda(): return 'nccl' - if fluid.core.is_compiled_with_xpu(): + if framework.core.is_compiled_with_xpu(): return 'bkcl' - if fluid.core.is_compiled_with_npu(): + if framework.core.is_compiled_with_npu(): return 'hccl' - if fluid.core.is_compiled_with_mlu(): + if framework.core.is_compiled_with_mlu(): return 'cncl' return 'gloo' diff --git a/python/paddle/distributed/fleet/model.py b/python/paddle/distributed/fleet/model.py old mode 100644 new mode 100755 diff --git a/python/paddle/distributed/fleet/optimizer.py b/python/paddle/distributed/fleet/optimizer.py old mode 100644 new mode 100755 index f67c108486a9b..042646ac50570 --- a/python/paddle/distributed/fleet/optimizer.py +++ b/python/paddle/distributed/fleet/optimizer.py @@ -72,7 +72,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None): def distributed_optimizer(*args, **kwargs): - if paddle.fluid.framework._non_static_mode(): + if paddle.framework._non_static_mode(): return _dygraph_distributed_optimizer(*args, **kwargs) else: return fleet.fleet.distributed_optimizer(*args, **kwargs) diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py old mode 100644 new mode 100755 index 60bc82016802c..0b8299517a79b --- a/python/paddle/distributed/fleet/scaler.py +++ b/python/paddle/distributed/fleet/scaler.py @@ -16,7 +16,7 @@ from .base.topology import ParallelMode from paddle.distributed import fleet from types import MethodType -from paddle.fluid import core +from paddle.framework import core from paddle.fluid.dygraph import to_variable import numpy as np from paddle import _legacy_C_ops diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py old mode 100644 new mode 100755 index ffd8e29dc551f..a14d28053f2cc --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -14,7 +14,7 @@ from paddle.static import Executor from .pass_base import PassType, CPPPassWrapper, register_pass -from paddle.fluid.framework import core, _apply_pass as _apply_cpp_pass +from paddle.framework import core, _apply_pass as _apply_cpp_pass @register_pass("fuse_elewise_add_act") diff --git a/python/paddle/distributed/passes/fuse_all_reduce.py b/python/paddle/distributed/passes/fuse_all_reduce.py old mode 100644 new mode 100755 index 7dbfc8e67936e..3e5ca75d62fbe --- a/python/paddle/distributed/passes/fuse_all_reduce.py +++ b/python/paddle/distributed/passes/fuse_all_reduce.py @@ -13,7 +13,7 @@ # limitations under the License. 
from paddle.framework import core -from paddle.fluid import unique_name +from paddle.utils import unique_name from .pass_base import PassBase, PassType, register_pass import numpy as np diff --git a/python/paddle/distributed/passes/pass_base.py b/python/paddle/distributed/passes/pass_base.py old mode 100644 new mode 100755 index 4cfef574a71a1..00f7bcc156d06 --- a/python/paddle/distributed/passes/pass_base.py +++ b/python/paddle/distributed/passes/pass_base.py @@ -13,7 +13,7 @@ # limitations under the License. from abc import ABC, abstractmethod -from paddle.fluid.framework import _apply_pass as _apply_cpp_pass +from paddle.framework import _apply_pass as _apply_cpp_pass class PassContext: diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py old mode 100644 new mode 100755 index 6725ed1443591..11250e32d35c4 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -66,5 +66,8 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.framework import in_dygraph_mode # noqa: F401 from ..fluid.framework import _in_legacy_dygraph # noqa: F401 +from ..fluid.framework import _global_flags # noqa: F401 +from ..fluid.framework import _apply_pass # noqa: F401 +from ..fluid.framework import switch_main_program __all__ = [] From d595928e00ff6de2c4a58677c33060de58f17cd2 Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Fri, 18 Nov 2022 19:38:25 +0800 Subject: [PATCH 099/210] Fused QKVBiasAdd and Transpose with Split Q, KV (#47680) * fused qkvBiasAdd and transpose with split qkv * fix typo * fix format * fix name * add annotation * fix comment --- paddle/fluid/operators/fused/fmha_ref.h | 174 ++++++++++++++++++ .../fused/fused_multi_transformer_op.cu | 97 ++++++---- .../fused/fused_multi_transformer_op.cu.h | 134 +++++++++++++- 3 files changed, 367 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 4854f81eae469..66176c9e75422 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -258,6 +258,180 @@ class FMHARef { dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); } + void ComputeForwardWithoutTranspose(const phi::DenseTensor& qkv_input_tensor, + const phi::DenseTensor* cache_kv_tensor, + const phi::DenseTensor* src_mask_tensor, + phi::DenseTensor* q_transpose_out_tensor, + phi::DenseTensor* kv_transpose_out_tensor, + phi::DenseTensor* cache_kv_out_tensor, + phi::DenseTensor* qk_out_tensor, + phi::DenseTensor* src_mask_out_tensor, + phi::DenseTensor* softmax_out_tensor, + phi::DenseTensor* dropout_mask_out_tensor, + phi::DenseTensor* dropout_out_tensor, + phi::DenseTensor* qktv_out_tensor, + phi::DenseTensor* fmha_out_tensor) { + // input shape: [bs, seq_len, 3, num_head, head_dim] + // transpose with perm [2, 0, 3, 1, 4], + // output_shape: [3, bs, num_head, seq_len, head_dim] + T* qk_out_data = qk_out_tensor->data(); + T* qktv_out_data = qktv_out_tensor->data(); + T* softmax_out_data = softmax_out_tensor->data(); + T* dropout_out_data = dropout_out_tensor->data(); + T* fmha_out_data = fmha_out_tensor->data(); + + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, + {*cache_kv_tensor, *kv_transpose_out_tensor}, + 3, + cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ 
* seq_len_ * num_head_ * head_dim_; + T* q_ptr = q_transpose_out_tensor->data(); + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = kv_transpose_out_tensor->data(); + v_ptr = k_ptr + k_size; + } + + { + // NOTE(wangxi): We scale Q with 1/sqrt(Dh) before QK^T, because for + // float16 calculation, INF may appear in QK^T if we do not scale before. + float alpha = 1.0 / sqrt(head_dim_); + auto functor = phi::funcs::ScaleFunctor(alpha); + std::vector ins = {q_transpose_out_tensor}; + std::vector outs = {q_transpose_out_tensor}; + phi::funcs::ElementwiseKernel(dev_ctx_, ins, &outs, functor); + } + + // q*k^t, batched_gemm + CBLAS_TRANSPOSE transA = CblasNoTrans; + CBLAS_TRANSPOSE transB = CblasTrans; + auto blas = phi::funcs::GetBlas(dev_ctx_); + int gemm_batch_size = batch_size_ * num_head_; + int gemm_m = seq_len_; + int gemm_n = out_seq_len; + int gemm_k = head_dim_; + T alpha = static_cast(1.0); + T beta = static_cast(0.0); + int64_t stride_a = gemm_m * gemm_k; + int64_t stride_b = gemm_k * gemm_n; + blas.BatchedGEMM(transA, + transB, + gemm_m, + gemm_n, + gemm_k, + alpha, + q_ptr, + k_ptr, + beta, + qk_out_data, + gemm_batch_size, + stride_a, + stride_b); + int softmax_axis = -1; + if (src_mask_tensor != nullptr) { + if (src_mask_out_tensor == nullptr && seq_len_ == out_seq_len) { + LaunchFusedSoftmaxMaskKernel(qk_out_data, + src_mask_tensor->data(), + softmax_out_data, + batch_size_, + num_head_, + seq_len_, + dev_ctx_.stream()); + } else { + std::vector ins; + std::vector outs; + ins.emplace_back(qk_out_tensor); + ins.emplace_back(src_mask_tensor); + outs.emplace_back(src_mask_out_tensor); + int elewise_add_axis = -1; + phi::funcs::BroadcastKernel( + dev_ctx_, + ins, + &outs, + elewise_add_axis, + phi::funcs::AddFunctor()); + + phi::SoftmaxForwardCUDAKernelDriver( + dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor); + } + } else { + phi::SoftmaxForwardCUDAKernelDriver( + dev_ctx_, *qk_out_tensor, softmax_axis, softmax_out_tensor); + } + + transB = CblasNoTrans; + gemm_m = seq_len_; + gemm_n = head_dim_; + gemm_k = out_seq_len; + alpha = static_cast(1.0); + stride_a = gemm_m * gemm_k; + stride_b = gemm_k * gemm_n; + + if (dropout_param_.dropout_prob_) { + DropoutFwGPUKernelDriver( + static_cast(dev_ctx_), + dropout_param_.is_test_, + dropout_param_.dropout_prob_, + dropout_param_.is_upscale_in_train_, + dropout_param_.is_fix_seed_, + dropout_param_.seed_val_, + static_cast(*softmax_out_tensor), + dropout_param_.seed_, + dropout_mask_out_tensor, + dropout_out_tensor, + false); + blas.BatchedGEMM(transA, + transB, + gemm_m, + gemm_n, + gemm_k, + alpha, + dropout_out_data, + v_ptr, + beta, + qktv_out_data, + gemm_batch_size, + stride_a, + stride_b); + } else { + // softmax_out * v, batched_gemm + // output shape: [batch_size, num_heads, seq_len, head_dim] + blas.BatchedGEMM(transA, + transB, + gemm_m, + gemm_n, + gemm_k, + alpha, + softmax_out_data, + v_ptr, + beta, + qktv_out_data, + gemm_batch_size, + stride_a, + stride_b); + } + // transpose: [0, 2, 1, 3] + // output shape: [batch_size, seq_len, num_heads, head_dim] + std::vector perm_3 = {0, 2, 1, 3}; + TransposeGPUKernelDriver( + dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); + } + void ComputeBackward(const phi::DenseTensor& transpose_2_out_tensor, const phi::DenseTensor* src_mask_tensor, const phi::DenseTensor& 
softmax_out_tensor, diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 1274e247e696b..f52bc2a7f54d1 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -59,13 +59,16 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; // (transA, transB, compute_bias) = (false, trans_qkvw, false) + // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we set + // compute_bias as false. auto qkv_compute = AttnMatMul(dev_ctx, false, trans_qkvw, bsz_seq, output_size, input_size, - compute_bias); + /*compute_bias=*/false); + Tensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = @@ -110,10 +113,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { out_seq_len += cache_offset; } - Tensor transpose_out_2, qk_out; - transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}}); - auto *transpose_out_2_data = - dev_ctx.Alloc(&transpose_out_2, transpose_out_2.numel() * sizeof(T)); + Tensor q_transpose_out, kv_transpose_out, qk_out; + q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); + auto *q_transpose_out_data = + dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); + + kv_transpose_out.Resize({{2, bsz, num_head, seq_len, dim_head}}); + auto *kv_transpose_out_data = dev_ctx.Alloc( + &kv_transpose_out, kv_transpose_out.numel() * sizeof(T)); + qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); @@ -305,19 +313,29 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { Tensor *pre_cache_kv_out_tmp = cache_offset > 0 ? &pre_cache_kv_out : nullptr; Tensor *src_mask_tmp = cache_offset > 0 ? 
&src_mask_out : nullptr; - fmha_compute.ComputeForward(qkv_out, - pre_cache_kv_tensor, - src_mask, - &transpose_out_2, - pre_cache_kv_out_tmp, - &qk_out, - src_mask_tmp, - &softmax_out, - &attn_dropout_mask_out, - &attn_dropout_out, - &qktv_out, - &fmha_out); - + qkv_bias_add_transpose_split(dev_ctx, + q_transpose_out_data, + kv_transpose_out_data, + qkv_out_data, + qkv_bias->data(), + bsz, + num_head, + seq_len, + dim_head, + compute_bias); + fmha_compute.ComputeForwardWithoutTranspose(qkv_out, + pre_cache_kv_tensor, + src_mask, + &q_transpose_out, + &kv_transpose_out, + pre_cache_kv_out_tmp, + &qk_out, + src_mask_tmp, + &softmax_out, + &attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); const T *k_ptr = nullptr; const T *v_ptr = nullptr; @@ -329,11 +347,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { v_ptr = k_ptr + k_size; } else { // [3, bsz, num_head, seq_len, head_dim] - T *qkv_data = transpose_out_2_data; - int64_t q_size = bsz * seq_len * num_head * dim_head; - int64_t k_size = q_size; - const T *q_ptr = qkv_data; - k_ptr = q_ptr + q_size; + int64_t k_size = bsz * seq_len * num_head * dim_head; + const T *q_ptr = q_transpose_out_data; + k_ptr = kv_transpose_out_data; v_ptr = k_ptr + k_size; } @@ -358,18 +374,29 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dim_head); } else { // not generation // TODO(wangxi): can remove dropout in inference - fmha_compute.ComputeForward(qkv_out, - cache_kv, - src_mask, - &transpose_out_2, - cache_kv_out, - &qk_out, - nullptr, - &softmax_out, - &attn_dropout_mask_out, - &attn_dropout_out, - &qktv_out, - &fmha_out); + qkv_bias_add_transpose_split(dev_ctx, + q_transpose_out_data, + kv_transpose_out_data, + qkv_out_data, + qkv_bias->data(), + bsz, + num_head, + seq_len, + dim_head, + compute_bias); + fmha_compute.ComputeForwardWithoutTranspose(qkv_out, + cache_kv, + src_mask, + &q_transpose_out, + &kv_transpose_out, + cache_kv_out, + &qk_out, + nullptr, + &softmax_out, + &attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step3"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index e0795616fd951..e6f4461f0c157 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -1,12 +1,9 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -1155,6 +1152,137 @@ void write_cache_kv(const phi::GPUContext &dev_ctx,
       cache_v, v, num_head, dim_head, seq_len, max_seq_len);
 }
 
+template <typename T, int VecSize, bool ComputeBias>
+__global__ void add_fusedQKV_bias_transpose_split_kernel(
+    T *q_buf,
+    T *kv_buf,
+    const T *qkv,
+    const T *qkv_bias,
+    const int32_t elem_cnt,
+    const int batch_size,
+    const int seq_len,
+    const int token_num,
+    const int head_num,
+    const int size_per_head) {
+  const int32_t offset = batch_size * seq_len * head_num * size_per_head;
+  const int32_t hidden_size = head_num * size_per_head;
+  const int32_t fused_hidden_size = 3 * hidden_size;
+  int64_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  LoadT src_vec;
+  LoadT bias_vec;
+
+  for (int32_t linear_index = global_thread_idx * VecSize,
+               step = gridDim.x * blockDim.x * VecSize;
+       linear_index < elem_cnt;
+       linear_index += step) {
+    phi::Load<T, VecSize>(&qkv[linear_index], &src_vec);
+    int32_t bias_idx = linear_index % fused_hidden_size;
+    if (ComputeBias) {
+      phi::Load<T, VecSize>(&qkv_bias[bias_idx], &bias_vec);
+#pragma unroll
+      for (int32_t unroll_idx = 0; unroll_idx < VecSize; unroll_idx++) {
+        src_vec[unroll_idx] += bias_vec[unroll_idx];
+      }
+    }
+    const int32_t token_idx = linear_index / fused_hidden_size;
+    // const int32_t token_padded_idx = token_idx + (padding_offset == nullptr ?
+    // 0 : padding_offset[token_idx]);
+    const int32_t target_batch_id = token_idx / seq_len;
+    const int32_t seq_id = token_idx % seq_len;
+
+    // equal to:
+    // const int qkv_id = (linear_index % fused_hidden_size) / hidden_size;
+    const int32_t qkv_id = bias_idx / hidden_size;
+    const int32_t head_id = (linear_index % hidden_size) / size_per_head;
+    const int32_t size_id = linear_index % size_per_head;
+
+    if (qkv_id == 0) {
+      phi::Store<T, VecSize>(
+          src_vec,
+          &q_buf[target_batch_id * head_num * seq_len * size_per_head +
+                 head_id * seq_len * size_per_head + seq_id * size_per_head +
+                 size_id]);
+    } else {
+      const int32_t kv_store_offset = (qkv_id - 1) * offset;
+      phi::Store<T, VecSize>(
+          src_vec,
+          &kv_buf[kv_store_offset +
+                  target_batch_id * head_num * seq_len * size_per_head +
+                  head_id * seq_len * size_per_head + seq_id * size_per_head +
+                  size_id]);
+    }
+  }
+}
+
+inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) {
+  constexpr int kBlockSize = 128;
+  constexpr int kNumWaves = 16;
+
+  const int device_id = paddle::platform::GetCurrentDeviceId();
+  const int sm_count = paddle::platform::GetGPUMultiProcessors(device_id);
+  const int max_thread_per_multiprocessor =
+      paddle::platform::GetGPUMultiProcessors(device_id);
+
+  *num_blocks =
+      std::max<int64_t>(1,
+                        std::min<int64_t>((n + kBlockSize - 1) / kBlockSize,
+                                          sm_count * max_thread_per_multiprocessor /
+                                              kBlockSize * kNumWaves));
+  return cudaSuccess;
+}
+
+template <typename T>
+void qkv_bias_add_transpose_split(const phi::GPUContext &dev_ctx,
+                                  T *q_buf,
+                                  T *kv_buf,
+                                  const T *qkv,
+                                  const T *qkv_bias,
+                                  const int batch_size,
+                                  const int head_num,
+                                  const int seq_len,
+                                  const int size_per_head,
+                                  bool compute_bias) {
+  const int32_t token_num = batch_size * seq_len;
+  const int32_t elem_cnt = token_num * head_num * size_per_head * 3;
+  constexpr int PackSize = VEC_16B / sizeof(T);
+  PADDLE_ENFORCE_EQ(size_per_head % PackSize,
+                    0,
+                    platform::errors::PreconditionNotMet(
+                        "dim_head=%d must be divisible by vec_size=%d",
+                        size_per_head,
+                        PackSize));
+  const int32_t pack_num = elem_cnt / PackSize;
+  const int32_t blocksize = 128;
+  int32_t grid_size = 1;
+  GetNumBlocks(pack_num, &grid_size);
+  if (compute_bias) {
+    add_fusedQKV_bias_transpose_split_kernel<T, PackSize, true>
+        
<<>>(q_buf, + kv_buf, + qkv, + qkv_bias, + elem_cnt, + batch_size, + seq_len, + token_num, + head_num, + size_per_head); + } else { + add_fusedQKV_bias_transpose_split_kernel + <<>>(q_buf, + kv_buf, + qkv, + qkv_bias, + elem_cnt, + batch_size, + seq_len, + token_num, + head_num, + size_per_head); + } +} + } // namespace } // namespace operators From 04709310379187be651dce51bd0c4028f379278c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 18 Nov 2022 21:05:27 +0800 Subject: [PATCH 100/210] refine save hook (#48124) --- paddle/fluid/eager/tensor_wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 6405191244505..67cd943f33174 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -74,7 +74,7 @@ class TensorWrapper { } else { #ifndef PADDLE_NO_PYTHON if (SavedTensorsHooks::GetInstance().IsEnable() && - tensor.is_dense_tensor()) { + tensor.is_dense_tensor() && tensor.initialized()) { phi::DenseTensor* dense_tensor = static_cast(tensor.impl().get()); intermidiate_tensor_.set_impl( From c775bc69d02d2d3b4eae6c1b9862d57cecb8bba0 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Sat, 19 Nov 2022 11:43:31 +0800 Subject: [PATCH 101/210] [CustomPlace] fix amp (#48090) * [CustomPlace] fix amp * [CustomPlace] fix amp * fix ut because of too long time matmul fp16 --- paddle/fluid/eager/amp_auto_cast.h | 3 ++- paddle/fluid/eager/eager_amp_auto_cast.h | 3 ++- .../fluid/tests/custom_runtime/test_custom_cpu_to_static.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index 5110f6f883e67..a16dd95396427 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -29,7 +29,8 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor, paddle::platform::is_xpu_place(place) || paddle::platform::is_mlu_place(place) || paddle::platform::is_npu_place(place) || - paddle::platform::is_npu_pinned_place(place)) { + paddle::platform::is_npu_pinned_place(place) || + paddle::platform::is_custom_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if ((data_type == paddle::experimental::DataType::FLOAT32 || data_type == paddle::experimental::DataType::FLOAT16 || diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 42961b84bcdb0..22748e31cfd7a 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -27,7 +27,8 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor, paddle::platform::is_xpu_place(place) || paddle::platform::is_mlu_place(place) || paddle::platform::is_npu_place(place) || - paddle::platform::is_npu_pinned_place(place)) { + paddle::platform::is_npu_pinned_place(place) || + paddle::platform::is_custom_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if ((data_type == paddle::experimental::DataType::FLOAT32 || data_type == paddle::experimental::DataType::FLOAT16 || diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py index a1b485b3f5ce0..6ce9cb45761ec 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py @@ -54,7 +54,11 @@ def train_func_ampo1(epoch_id, 
train_loader, model, cost, optimizer, scaler): for batch_id, (images, labels) in enumerate(train_loader()): # forward with paddle.amp.auto_cast( - custom_black_list={"flatten_contiguous_range", "greater_than"}, + custom_black_list={ + "flatten_contiguous_range", + "greater_than", + "matmul_v2", + }, level='O1', ): outputs = model(images) From f38e09f0400f44b2b19ea9b89048d4d593712743 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Sat, 19 Nov 2022 15:21:31 +0800 Subject: [PATCH 102/210] refactor: rm redundant funcs (#48149) --- .../distributed/collective/ProcessGroup.h | 22 -- .../collective/ProcessGroupNCCL.cc | 287 ------------------ .../distributed/collective/ProcessGroupNCCL.h | 47 --- .../collective/ProcessGroupStream.cc | 37 --- .../collective/ProcessGroupStream.h | 23 -- 5 files changed, 416 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 795a1a91b5235..09be2ca5e8788 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -244,26 +244,12 @@ class ProcessGroup { "ProcessGroup%s does not support send", GetBackendName())); } - virtual std::shared_ptr Send( - std::vector&, int, bool) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send with sync_op flag", - GetBackendName())); - } - virtual std::shared_ptr Recv( std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support recv", GetBackendName())); } - virtual std::shared_ptr Recv( - std::vector&, int, bool) { // NOLINT - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support recv with sync_op flag", - GetBackendName())); - } - virtual std::shared_ptr AllGather( std::vector&, // NOLINT std::vector&) { // NOLINT @@ -287,14 +273,6 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } - virtual std::shared_ptr AllToAll( - std::vector&, // NOLINT - std::vector&, // NOLINT - bool) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support alltoall", GetBackendName())); - } - virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 74ebf80205964..3c7bc0ec8429f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -626,35 +626,6 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( return task; } -void ProcessGroupNCCL::CheckSplitSizes(std::vector* split_sizes, - std::vector tensor_shape) { - int64_t len_size = (*split_sizes).size(); - if (len_size == 0) { - PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0, - true, - platform::errors::InvalidArgument( - "Tensor's dim[0] must be divisible by group size " - "when split_sizes not given.")); - (*split_sizes) - .insert((*split_sizes).end(), - size_, - static_cast(tensor_shape[0] / size_)); - } else { - PADDLE_ENFORCE_EQ( - len_size == size_, - true, - platform::errors::InvalidArgument( - "The length of split_sizes must be equal to group size.")); - auto sum_size = std::accumulate( - (*split_sizes).begin(), (*split_sizes).end(), static_cast(0)); - PADDLE_ENFORCE_EQ( - sum_size == tensor_shape[0], - true, - platform::errors::InvalidArgument( - "The sum of split_sizes must be equal to 
tensor's dim[0].")); - } -} - // TODO(sunyilun): methods below will be removed later void SyncDefaultStream(const std::vector& places, platform::DeviceEvent& nccl_event, // NOLINT @@ -676,17 +647,6 @@ std::shared_ptr ProcessGroupNCCL::CreateTask( places, rank, comm_type, inputs); } -std::shared_ptr ProcessGroupNCCL::CreateTask( - const std::vector& places, - int rank, - CommType comm_type, - const std::vector& inputs, - bool is_sync, - bool use_calc_stream) { - return std::make_shared( - places, rank, comm_type, inputs, is_sync, use_calc_stream); -} - ProcessGroupNCCL::NCCLTask::NCCLTask( const std::vector& places, int rank, @@ -696,17 +656,6 @@ ProcessGroupNCCL::NCCLTask::NCCLTask( comm_event_(places[0]), task_place_(places[0]) {} -ProcessGroupNCCL::NCCLTask::NCCLTask( - const std::vector& places, - int rank, - CommType comm_type, - const std::vector& inputs, - bool sync_op, - bool use_calc_stream) - : TaskStream(rank, inputs, comm_type, sync_op, use_calc_stream), - comm_event_(places[0]), - task_place_(places[0]) {} - // create NCCLManager cache for places_key void ProcessGroupNCCL::CreateNCCLManagerCache( const std::string& places_key, const std::vector& places) { @@ -759,85 +708,6 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( places_to_ctx_.emplace(places_key, std::move(dev_ctx_raw)); } -template -std::shared_ptr ProcessGroupNCCL::Collective( - std::vector& inputs, - std::vector& outputs, - Fn fn, - CommType comm_type, - bool sync_op, - bool use_calc_stream) { - const auto& places = GetPlaceList(inputs); - const auto& key = GetKeyFromPlaces(places); - - { - std::lock_guard lock(mutex_); - if (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) { - CreateNCCLManagerCache(key, places); - } - } - - if (!use_calc_stream) { - SyncDefaultStream( - places, place_to_calc_event_.at(key), places_to_ctx_.at(key)); - } - - auto task = - CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream); - - platform::CUDADeviceGuard cuda_guard; - - { - platform::NCCLGroupGuard nccl_guard; - for (size_t i = 0; i < inputs.size(); ++i) { - cuda_guard.SetDevice(places[i]); - - gpuStream_t nccl_stream; - if (use_calc_stream) { - nccl_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(places[i])) - ->stream(); - } else { - nccl_stream = places_to_ctx_.at(key)[i]->stream(); - } - - fn(inputs[i], - outputs[i], - places_to_ctx_.at(key)[i]->nccl_comm(), - nccl_stream); - } - } - - if (FLAGS_use_stream_safe_cuda_allocator) { - for (size_t i = 0; i < inputs.size(); ++i) { - cuda_guard.SetDevice(places[i]); - - gpuStream_t nccl_stream; - if (use_calc_stream) { - nccl_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(places[i])) - ->stream(); - } else { - nccl_stream = places_to_ctx_.at(key)[i]->stream(); - } - - memory::RecordStream(inputs[i].Holder(), nccl_stream); - } - } - - // Adding stream event dependency only when use comm stream - if (!use_calc_stream) { - for (size_t i = 0; i < inputs.size(); ++i) { - cuda_guard.SetDevice(places[i]); - task->UpdateWaitChain(*places_to_ctx_.at(key)[i]); - } - } - - return task; -} - template std::shared_ptr ProcessGroupNCCL::Collective( std::vector& inputs, @@ -889,117 +759,6 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } -template -void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, - phi::DenseTensor* out, - Fn fn, - CommType op_type) { - std::vector places; - places.push_back(in->place()); - const std::string& key = GetKeyFromPlaces(places); - - { - std::lock_guard lock(mutex_); - if 
(place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) { - CreateNCCLManagerCache(key, places); - } - } - - SyncDefaultStream( - places, place_to_calc_event_.at(key), places_to_ctx_.at(key)); - - // construct uninitialize guard for device - platform::CUDADeviceGuard cuda_guard; - - if (FLAGS_use_stream_safe_cuda_allocator) { - cuda_guard.SetDevice(places[0]); - memory::RecordStream(in->Holder(), places_to_ctx_.at(key)[0]->stream()); - } - - { - platform::NCCLGroupGuard nccl_guard; - cuda_guard.SetDevice(places[0]); - const auto& nccl_stream = places_to_ctx_.at(key)[0]->stream(); - fn(in, out, places_to_ctx_.at(key)[0]->nccl_comm(), nccl_stream); - } - - cuda_guard.SetDevice(places[0]); -} - -template -std::shared_ptr ProcessGroupNCCL::PointToPoint( - std::vector& tensors, - Fn fn, - int dst_rank, - CommType op_type, - bool sync_op, - bool use_calc_stream) { - const auto& places = GetPlaceList(tensors); - const auto& key = GetKeyFromPlaces(places); - - { - std::lock_guard lock(mutex_); - if (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) { - CreateNCCLManagerCache(key, places); - } - } - - if (!use_calc_stream) { - SyncDefaultStream( - places, place_to_calc_event_.at(key), places_to_ctx_.at(key)); - } - - auto task = - CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream); - - platform::CUDADeviceGuard cuda_guard; - - { - platform::NCCLGroupGuard nccl_guard; - for (size_t i = 0; i < tensors.size(); ++i) { - cuda_guard.SetDevice(places[i]); - gpuStream_t nccl_stream; - if (use_calc_stream) { - nccl_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(places[i])) - ->stream(); - } else { - nccl_stream = places_to_ctx_.at(key)[i]->stream(); - } - fn(tensors[i], - places_to_ctx_.at(key)[i]->nccl_comm(), - nccl_stream, - dst_rank); - } - } - - if (FLAGS_use_stream_safe_cuda_allocator) { - for (size_t i = 0; i < tensors.size(); ++i) { - cuda_guard.SetDevice(places[i]); - gpuStream_t nccl_stream; - if (use_calc_stream) { - nccl_stream = - static_cast( - platform::DeviceContextPool::Instance().Get(places[i])) - ->stream(); - } else { - nccl_stream = places_to_ctx_.at(key)[i]->stream(); - } - memory::RecordStream(tensors[i].Holder(), nccl_stream); - } - } - - if (!use_calc_stream) { - for (size_t i = 0; i < tensors.size(); ++i) { - cuda_guard.SetDevice(places[i]); - task->UpdateWaitChain(*places_to_ctx_.at(key)[i]); - } - } - - return task; -} - template std::shared_ptr ProcessGroupNCCL::PointToPoint( std::vector& tensors, @@ -1290,52 +1049,6 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CommType::ALLTOALL); } -std::shared_ptr ProcessGroupNCCL::AllToAll( - std::vector& in_tensors, - std::vector& out_tensors, - bool sync_op, - bool use_calc_stream) { - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(in_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(out_tensors), - true, - platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - in_tensors, - out_tensors, - [&](phi::DenseTensor& input, - phi::DenseTensor& output, - ncclComm_t comm, - const gpuStream_t& stream) { - size_t offset = 0; - GroupStart(); - for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input.data(), offset, input.dtype()), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - 
GetPointerByOffset(output.data(), offset, input.dtype()), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - i, - comm, - stream)); - offset += input.numel() / size_; - } - GroupEnd(); - }, - CommType::ALLTOALL, - sync_op, - use_calc_stream); -} - std::shared_ptr ProcessGroupNCCL::Reduce( std::vector& in_tensors, std::vector& out_tensors, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index c10c4370b4b23..a52e5e61cd295 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -68,12 +68,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { int rank, CommType CommType, const std::vector& inputs); - NCCLTask(const std::vector& places, - int rank, - CommType comm_type, - const std::vector& inputs, - bool sync_op, - bool use_calc_stream); private: bool block_cpu_in_wait_{false}; @@ -192,12 +186,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::vector& in_tensors, std::vector& out_tensors) override; - std::shared_ptr AllToAll( - std::vector& in_tensors, - std::vector& out_tensors, - bool sync_op, - bool use_calc_stream) override; - std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, @@ -245,14 +233,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { CommType op_type, const std::vector& inputs); - std::shared_ptr CreateTask( - const std::vector& places, - int rank, - CommType op_type, - const std::vector& inputs, - bool sync_op, - bool use_calc_stream); - template std::shared_ptr Collective( std::vector& inputs, // NOLINT @@ -260,21 +240,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { Fn fn, CommType op_type); - template - std::shared_ptr Collective( - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - Fn fn, - CommType comm_type, - bool sync_op, - bool use_calc_stream); - - template - void Collective(const phi::DenseTensor*, - phi::DenseTensor*, - Fn fn, - CommType op_type); - template std::shared_ptr PointToPoint( std::vector& tensors, // NOLINT @@ -282,21 +247,9 @@ class ProcessGroupNCCL final : public ProcessGroupStream { int dst_rank, CommType op_type); - template - std::shared_ptr PointToPoint( - std::vector& tensors, // NOLINT - Fn fn, - int dst_rank, - CommType op_type, - bool sync_op, - bool use_calc_stream); - void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); - void CheckSplitSizes(std::vector* split_sizes, - std::vector tensor_shape); - private: std::shared_ptr store_; std::unordered_map diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 9f7b3c1964e23..cd1e617a89e4c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -236,42 +236,5 @@ std::shared_ptr ProcessGroupStream::Send( "ProcessGroup%s does not support send.", GetBackendName())); } -// TODO(sunyilun): methods below will be removed later -std::shared_ptr ProcessGroupStream::AllToAll( - std::vector& in_tensors, - std::vector& out_tensors, - bool sync_op) { - return AllToAll(in_tensors, - out_tensors, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::AllToAll( - std::vector& in_tensors, - std::vector& out_tensors, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not 
support do alltoall", GetBackendName())); -} - -std::shared_ptr ProcessGroupStream::Recv( - std::vector& tensors, int src_rank, bool sync_op) { - return Recv(tensors, - src_rank, - sync_op, - /*use_calc_stream*/ false); -} - -std::shared_ptr ProcessGroupStream::Recv( - std::vector& tensors, - int src_rank, - bool sync_op, - bool use_calc_stream) { - PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do recv", GetBackendName())); -} - } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index d1fd95953f1f0..be76429580d10 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -179,29 +179,6 @@ class ProcessGroupStream : public ProcessGroup { int64_t numel, bool sync_op, bool use_calc_stream); - - // TODO(sunyilun): methods below will be removed later - std::shared_ptr AllToAll( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - bool sync_op) override; - - virtual std::shared_ptr AllToAll( - std::vector& in_tensors, // NOLINT - std::vector& out_tensors, // NOLINT - bool sync_op, - bool use_calc_stream); - - std::shared_ptr Recv( - std::vector& tensors, // NOLINT - int src_rank, - bool sync_op) override; - - virtual std::shared_ptr Recv( - std::vector& tensors, // NOLINT - int src_rank, - bool sync_op, - bool use_calc_stream); }; } // namespace distributed From 5675c7d549b55f0230ee8cec3e8c68dc7e8e4dbc Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Sun, 20 Nov 2022 18:13:22 +0800 Subject: [PATCH 103/210] remove range from fluid (#48086) * remove range --- .../dygraph_to_static/convert_operators.py | 9 +- python/paddle/fluid/layers/rnn.py | 2 +- python/paddle/fluid/layers/tensor.py | 103 ------------------ .../test_dynamic_rnn_stop_gradient.py | 3 +- .../fluid/tests/unittests/test_layers.py | 8 +- 5 files changed, 11 insertions(+), 114 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index abf9c48828039..fc91a3a797424 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -20,7 +20,6 @@ ) from paddle.fluid.framework import core, Variable from paddle.fluid.layers import Assert, Print -from paddle.fluid.layers import range as paddle_range from paddle.fluid.layers import ( array_length, array_read, @@ -570,7 +569,7 @@ def __init__(self, var, start=0): self.var = var self.len = convert_len(var) if isinstance(self.len, Variable): - self.rag = paddle_range(start, start + self.len, 1, paddle.int64) + self.rag = paddle.arange(start, start + self.len, 1, paddle.int64) else: self.rag = range(start, start + self.len) @@ -592,11 +591,11 @@ def convert_range(*args): has_variable = any(map(lambda x: isinstance(x, Variable), args)) if has_variable: if len(args) == 1: - return paddle_range(0, args[0], 1, paddle.int64) + return paddle.arange(0, args[0], 1, paddle.int64) if len(args) == 2: - return paddle_range(args[0], args[1], 1, paddle.int64) + return paddle.arange(args[0], args[1], 1, paddle.int64) if len(args) == 3: - return paddle_range(args[0], args[1], args[2], paddle.int64) + return paddle.arange(args[0], args[1], args[2], paddle.int64) return range(*args) diff --git 
a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 0104502a7ddbd..60b0eb5da67d0 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1163,7 +1163,7 @@ def _gather(self, x, indices, batch_size): batch_size.stop_gradient = True # TODO: remove this batch_pos = paddle.tile( nn.unsqueeze( - tensor.range(0, batch_size, 1, dtype=indices.dtype), [1] + paddle.arange(0, batch_size, 1, dtype=indices.dtype), [1] ), [1, self.beam_size], ) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 0f6652fdd5d7c..3982bcac6fa13 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -66,7 +66,6 @@ 'has_inf', 'has_nan', 'isfinite', - 'range', 'linspace', 'zeros_like', 'ones_like', @@ -1635,108 +1634,6 @@ def isfinite(x): return out -def range(start, end, step, dtype, name=None): - """ - This OP returns a 1-D Tensor with spaced values within a given interval. - - Values are generated into the half-open interval [``start``, ``end``) with - the ``step``. (the interval including ``start`` but excluding ``end``). - - If ``dtype`` is float32 or float64, we advise adding a small epsilon to - ``end`` to avoid floating point rounding errors when comparing against ``end``. - - Parameters: - start(float|int|Tensor): Start of interval. The interval includes this - value. If ``start`` is a Tensor, it is a 1-D Tensor with shape [1], - with data type int32, int64, float32, float64. - end(float|int|Tensor): End of interval. The interval does not include - this value. If ``end`` is a Tensor, it is a 1-D Tensor with shape - [1], with data type int32, int64, float32, float64. - step(float|int|Tensor): Spacing between values. For any out, it is - the istance between two adjacent values, out[i+1] - out[i]. If - ``step`` is a Tensor, it is a 1-D Tensor with shape [1], with data - type int32, int64, float32, float64. - dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of the - output tensor. Supported data types: int32, int64, float32, float64. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) - taken with common difference ``step`` beginning from ``start``. Its - data type is set by ``dtype``. - - Raises: - TypeError: If ``dtype`` is not int32, int64, float32, float64. - - examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - - out1 = fluid.layers.range(0, 10, 2, 'int32') - # [0, 2, 4, 6, 8] - - start_var = fluid.layers.fill_constant([1], 'int64', 3) - out2 = fluid.layers.range(start_var, 7, 1, 'int64') - # [3, 4, 5, 6] - - """ - out_shape = None - if ( - not isinstance(start, Variable) - and not isinstance(end, Variable) - and not isinstance(step, Variable) - ): - out_shape = [int(math.ceil((end - start) / step))] - - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - - if not isinstance(start, Variable): - with device_guard("cpu"): - start = fill_constant([1], dtype, start, force_cpu=True) - elif start.dtype != dtype: - start = cast(start, dtype) - - if not isinstance(end, Variable): - with device_guard("cpu"): - end = fill_constant([1], dtype, end, force_cpu=True) - elif end.dtype != dtype: - end = cast(end, dtype) - - if not isinstance(step, Variable): - with device_guard("cpu"): - step = fill_constant([1], dtype, step, force_cpu=True) - elif step.dtype != dtype: - step = cast(step, dtype) - - if in_dygraph_mode(): - return _C_ops.arange(start, end, step, dtype, _current_expected_place()) - - if _in_legacy_dygraph(): - out = _legacy_C_ops.range(start, end, step) - out.stop_gradient = True - return out - - check_dtype( - dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange' - ) - helper = LayerHelper('range', **locals()) - out = helper.create_variable_for_type_inference(dtype, shape=out_shape) - helper.append_op( - type='range', - inputs={'Start': start, 'End': end, 'Step': step}, - outputs={'Out': out}, - ) - out.stop_gradient = True - if out_shape is not None: - out.desc.set_shape(out_shape) - return out - - def linspace(start, stop, num, dtype=None, name=None): r""" This OP return fixed number of evenly spaced values within a given interval. diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py index dd19157c32169..f3f971b5778d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py @@ -13,6 +13,7 @@ # limitations under the License. 
import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import unittest @@ -43,7 +44,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): bs = layers.cast(bs, 'int64') bs.stop_gradient = stop_gradient batch_pos = layers.expand( - layers.unsqueeze(layers.range(0, bs, 1, dtype=bs.dtype), [1]), + layers.unsqueeze(paddle.arange(0, bs, 1, dtype=bs.dtype), [1]), [1, beam_size], ) topk_coordinates = paddle.stack([batch_pos, indices], axis=2) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index de5b2cb67fbba..2eaf9432c6e87 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -4023,13 +4023,13 @@ def make_range(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() ): - layers.range(0, 10, 2, 'int32') - layers.range(0.1, 10.0, 0.2, 'float32') - layers.range(0.1, 10.0, 0.2, 'float64') + paddle.arange(0, 10, 2, 'int32') + paddle.arange(0.1, 10.0, 0.2, 'float32') + paddle.arange(0.1, 10.0, 0.2, 'float64') start = layers.fill_constant(shape=[1], value=0.1, dtype="float32") end = layers.fill_constant(shape=[1], value=10.0, dtype="float32") step = layers.fill_constant(shape=[1], value=0.2, dtype="float32") - y = layers.range(start, end, step, 'float64') + y = paddle.arange(start, end, step, 'float64') return y def make_spectral_norm(self): From c00f0dafa79f985a5ffd3d885e387ec57f5cf8e0 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 21 Nov 2022 07:40:18 +0800 Subject: [PATCH 104/210] add state_dict convert (#48161) --- python/paddle/fluid/dygraph/layers.py | 2 +- .../unittests/test_state_dict_convert.py | 77 +++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_state_dict_convert.py diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 752694b614af7..5e15519bd9627 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1627,7 +1627,7 @@ def _check_match(key, param): return param, state matched_param_state = [] - for key, param in self.state_dict(use_hook=False).items(): + for key, param in self._state_dict_impl(use_hook=False).items(): key_name = key if use_structured_name else param.name try: match_res = _check_match(key_name, param) diff --git a/python/paddle/fluid/tests/unittests/test_state_dict_convert.py b/python/paddle/fluid/tests/unittests/test_state_dict_convert.py new file mode 100644 index 0000000000000..f62f983e90320 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_state_dict_convert.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import numpy as np +import unittest + + +class MyModel(nn.Layer): + def __init__(self): + super().__init__() + self.linear = nn.Linear(100, 300) + + def forward(self, x): + return self.linear(x) + + @paddle.no_grad() + def state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + use_hook=True, + ): + st = super().state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix, + use_hook=use_hook, + ) + st["linear.new_weight"] = paddle.transpose( + st.pop("linear.weight"), [1, 0] + ) + return st + + @paddle.no_grad() + def set_state_dict(self, state_dict, use_structured_name=True): + state_dict["linear.weight"] = paddle.transpose( + state_dict.pop("linear.new_weight"), [1, 0] + ) + return super().set_state_dict(state_dict) + + +def is_state_dict_equal(model1, model2): + st1 = model1.state_dict() + st2 = model2.state_dict() + assert set(st1.keys()) == set(st2.keys()) + for k, v1 in st1.items(): + v2 = st2[k] + if not np.array_equal(v1.numpy(), v2.numpy()): + return False + return True + + +class TestStateDictConvert(unittest.TestCase): + def test_main(self): + model1 = MyModel() + model2 = MyModel() + self.assertFalse(is_state_dict_equal(model1, model2)) + model2.set_state_dict(model1.state_dict()) + self.assertTrue(is_state_dict_equal(model1, model2)) + + +if __name__ == "__main__": + unittest.main() From 02c51f3b33b422c6e548105d9e31ed6ed12cc3ed Mon Sep 17 00:00:00 2001 From: PuQing Date: Mon, 21 Nov 2022 10:39:03 +0800 Subject: [PATCH 105/210] remove macros.h (#48069) --- paddle/fluid/platform/macros.h | 16 +--------------- paddle/phi/core/macros.h | 6 ++++++ paddle/phi/kernels/funcs/pooling.h | 2 +- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 2ea58a7bb0c81..3f854d40b8b23 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -14,18 +14,4 @@ limitations under the License. */ #pragma once -// Disable the copy and assignment operator for a class. -#ifndef DISABLE_COPY_AND_ASSIGN -#define DISABLE_COPY_AND_ASSIGN(classname) \ - private: \ - classname(const classname&) = delete; \ - classname(classname&&) = delete; \ - classname& operator=(const classname&) = delete; \ - classname& operator=(classname&&) = delete -#endif - -#ifndef PADDLE_WITH_MUSL -#if defined(__FLT_MAX__) -#define FLT_MAX __FLT_MAX__ -#endif // __FLT_MAX__ -#endif // PADDLE_WITH_MUSL +#include "paddle/phi/core/macros.h" diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index e48f7342e456e..2e78357492734 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -59,4 +59,10 @@ namespace phi { #define PADDLE_RESTRICT #endif +#ifndef PADDLE_WITH_MUSL +#if defined(__FLT_MAX__) +#define FLT_MAX __FLT_MAX__ +#endif // __FLT_MAX__ +#endif // PADDLE_WITH_MUSL + } // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index 1d1eacd0d5098..17b87a0e17d51 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -18,10 +18,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/macros.h" // import FLT_MAX #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/macros.h" // import FLT_MAX #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_decls.h" From 403d58bba0c1c64560826d61816aba2cb7fe8653 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 21 Nov 2022 10:41:28 +0800 Subject: [PATCH 106/210] return pointer rather than reference (#48152) --- .../distributed/collective/ProcessGroup.h | 2 +- .../collective/ProcessGroupBKCL.cc | 4 ++-- .../distributed/collective/ProcessGroupBKCL.h | 6 +++--- .../collective/ProcessGroupCustom.cc | 4 ++-- .../collective/ProcessGroupCustom.h | 2 +- .../distributed/collective/ProcessGroupGloo.h | 5 ++--- .../collective/ProcessGroupNCCL.cc | 8 +++---- .../distributed/collective/ProcessGroupNCCL.h | 8 +++---- .../collective/ProcessGroupStream.cc | 2 +- .../collective/ProcessGroupStream.h | 5 +++-- .../fluid/distributed/collective/reducer.cc | 6 +++--- paddle/fluid/pybind/distributed_py.cc | 21 +++++++++---------- 12 files changed, 36 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 09be2ca5e8788..8cc0cad8a5be0 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -96,7 +96,7 @@ class ProcessGroup { virtual std::string GetBackendName() const = 0; - virtual const phi::DeviceContext& GetDeviceContext(const Place& place) const { + virtual phi::DeviceContext* GetDeviceContext(const Place& place) const { PADDLE_THROW(platform::errors::Unimplemented( "ProcessGroup%s does not support get device_context.", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 5c122ce2a3216..42d7d7200edcd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -282,12 +282,12 @@ std::shared_ptr ProcessGroupBKCL::Barrier( return task; } -const phi::DeviceContext& ProcessGroupBKCL::GetDeviceContext( +phi::DeviceContext* ProcessGroupBKCL::GetDeviceContext( const Place& place) const { return GetDeviceContext(place, /*use_calc_stream*/ false); } -const phi::DeviceContext& ProcessGroupBKCL::GetDeviceContext( +phi::DeviceContext* ProcessGroupBKCL::GetDeviceContext( const Place& place, bool use_calc_stream) const { const std::string& key = GetKeyFromPlace(place); if (use_calc_stream) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index f7a95f9e48269..11c0dfbdc6234 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -77,10 +77,10 @@ class ProcessGroupBKCL : public ProcessGroupStream { return std::string(BKCL_BACKEND_NAME); } - const phi::DeviceContext& GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place) const override; - const phi::DeviceContext& GetDeviceContext( - const Place& place, bool use_calc_stream) const override; + phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const override; std::shared_ptr AllReduce( phi::DenseTensor* out_tensor, 
diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index 2a87c78993719..4eee250e48a52 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -299,7 +299,7 @@ std::shared_ptr ProcessGroupCustom::Barrier( return task; } -const phi::DeviceContext& ProcessGroupCustom::GetDeviceContext( +phi::DeviceContext* ProcessGroupCustom::GetDeviceContext( const Place& place) const { const std::string key = GetKeyFromPlace(place); const auto& iter = places_to_ctx_.find(key); @@ -308,7 +308,7 @@ const phi::DeviceContext& ProcessGroupCustom::GetDeviceContext( places_to_ctx_.end(), platform::errors::NotFound( "Cannot find the device context in this process group.")); - return *iter->second[0]; + return iter->second[0].get(); } phi::ccl::CCLComm ProcessGroupCustom::CustomCCLComm(const Place& place) const { diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index 050e780ae120d..6aca380258644 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -93,7 +93,7 @@ class ProcessGroupCustom : public ProcessGroup { std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; - const phi::DeviceContext& GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place) const override; phi::ccl::CCLComm CustomCCLComm(const Place& place) const; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index fd691e024c4a5..5e8dc1c5e602c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -180,9 +180,8 @@ class ProcessGroupGloo : public ProcessGroup { std::string GetBackendName() const override { return "GLOO"; } - const phi::DeviceContext& GetDeviceContext( - const Place& place) const override { - return *platform::DeviceContextPool::Instance().Get(place); + phi::DeviceContext* GetDeviceContext(const Place& place) const override { + return platform::DeviceContextPool::Instance().Get(place); } // Helper functions for Gloo. 
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 3c7bc0ec8429f..f9ceaf089992c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -94,17 +94,17 @@ void ProcessGroupNCCL::GroupEnd() { NCCL_CHECK(platform::dynload::ncclGroupEnd()); } -const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( +phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { return GetDeviceContext(place, /*use_calc_stream*/ false); } -const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( +phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place, bool use_calc_stream) const { const std::string& key = GetKeyFromPlace(place); if (use_calc_stream) { const auto& iter = place_to_calc_ctx_.find(key); - return *iter->second; + return iter->second; } else { const auto& iter = place_to_comm_ctx_.find(key); PADDLE_ENFORCE_NE( @@ -112,7 +112,7 @@ const phi::DeviceContext& ProcessGroupNCCL::GetDeviceContext( place_to_comm_ctx_.end(), platform::errors::NotFound( "Cannot find the device context in this process group.")); - return *iter->second; + return iter->second.get(); } } diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index a52e5e61cd295..d50003ba5a702 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -34,7 +34,7 @@ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" -#else +#elif PADDLE_WITH_NCCL #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -83,10 +83,10 @@ class ProcessGroupNCCL final : public ProcessGroupStream { std::string GetBackendName() const override { return "NCCL"; } - const phi::DeviceContext& GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const override; - const phi::DeviceContext& GetDeviceContext( - const Place& place, bool use_calc_stream) const override; + phi::DeviceContext* GetDeviceContext(const Place& place) const override; std::shared_ptr AllGather( phi::DenseTensor* out_tensor, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index cd1e617a89e4c..332298ecfd4a2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -20,7 +20,7 @@ namespace distributed { ProcessGroupStream::ProcessGroupStream(int rank, int size, int gid) : ProcessGroup(rank, size, gid) {} -const phi::DeviceContext& ProcessGroupStream::GetDeviceContext( +phi::DeviceContext* ProcessGroupStream::GetDeviceContext( const Place& place, bool use_calc_stream) const { PADDLE_THROW(platform::errors::Unimplemented( "ProcessGroup%s does not support get device_context.", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index be76429580d10..fcdbd88562edf 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -57,9 +57,10 @@ class ProcessGroupStream : public ProcessGroup { public: ProcessGroupStream(int rank, int size, int gid); virtual ~ProcessGroupStream() = default; + using ProcessGroup::GetDeviceContext; - virtual 
const phi::DeviceContext& GetDeviceContext( - const Place& place, bool use_calc_stream) const; + virtual phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const; std::shared_ptr AllGather( phi::DenseTensor* out_tensor, diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index f8850660640c3..cd8c8ed2e0cc9 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -1053,9 +1053,9 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, } group->task = process_group_->AllReduce(in_out, in_out, opts); - const auto &context = process_group_->GetDeviceContext(inner_place_); - group->SplitTensorsDev(context); - group->task->UpdateWaitChain(context); + auto *context = process_group_->GetDeviceContext(inner_place_); + group->SplitTensorsDev(*context); + group->task->UpdateWaitChain(*context); // split in FinalizeBackward() } diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 0634f825a0110..7fdf4a0930ebf 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -271,14 +271,14 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - const auto &dev_ctx = self.GetDeviceContext(in_tensor.place()); + auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); auto task = self.AllGather(out_dense, in_dense, /*offset*/ 0, /*numel*/ -1, sync_op); - SplitTensor(dev_ctx, *out_dense, &out_tensor_list); - task->UpdateWaitChain(dev_ctx); + SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); + task->UpdateWaitChain(*dev_ctx); return task; }, py::arg("out"), @@ -334,7 +334,7 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; // in_tensor_list should not be empty - const auto &dev_ctx = + auto *dev_ctx = self.GetDeviceContext(in_tensor_list.back().place()); int world_size = self.GetSize(); auto task = @@ -343,8 +343,8 @@ void BindDistributed(py::module *m) { GetDefaultSplitSizes(*out_dense, world_size), GetDefaultSplitSizes(in_dense, world_size), sync_op); - SplitTensor(dev_ctx, *out_dense, &out_tensor_list); - task->UpdateWaitChain(dev_ctx); + SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); + task->UpdateWaitChain(*dev_ctx); return task; }, py::arg("out"), @@ -770,15 +770,14 @@ void BindDistributed(py::module *m) { in_tensor.impl()); auto in_dense = *p_in_tensor; - const auto &dev_ctx = - self.GetDeviceContext(in_tensor.place(), true); + auto *dev_ctx = self.GetDeviceContext(in_tensor.place(), true); auto task = self.AllGather(out_dense, in_dense, /*offset*/ 0, /*numel*/ -1, /*sync_op*/ true, /*use_calc_stream*/ true); - SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); return task; }, py::arg("out"), @@ -886,7 +885,7 @@ void BindDistributed(py::module *m) { auto in_dense = *p_in_tensor; // in_tensor_list should not be empty - const auto &dev_ctx = self.GetDeviceContext( + auto *dev_ctx = self.GetDeviceContext( in_tensor_list.back().place(), /*use_calc_stream*/ true); int world_size = self.GetSize(); auto task = @@ -896,7 +895,7 @@ void BindDistributed(py::module *m) { GetDefaultSplitSizes(in_dense, world_size), /*sync_op*/ true, /*use_calc_stream*/ true); - SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + SplitTensor(*dev_ctx, *out_dense, &out_tensor_list); return task; }, py::arg("out"), From 2a47416cd6f9b8fbc1db81dc277f6049e1c980e6 Mon Sep 17 00:00:00 2001 
From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 21 Nov 2022 10:46:05 +0800 Subject: [PATCH 107/210] add new map instance (#48145) --- .../distributed/collective/ProcessGroup.cc | 16 +-- .../distributed/collective/ProcessGroup.h | 21 ++-- .../collective/ProcessGroupBKCL.cc | 8 ++ .../distributed/collective/ProcessGroupBKCL.h | 3 + .../collective/ProcessGroupCustom.cc | 13 +++ .../collective/ProcessGroupCustom.h | 7 ++ .../collective/ProcessGroupGloo.cc | 17 +++ .../distributed/collective/ProcessGroupGloo.h | 14 ++- .../collective/ProcessGroupNCCL.cc | 8 ++ .../distributed/collective/ProcessGroupNCCL.h | 3 + paddle/fluid/pybind/distributed_py.cc | 100 ++++++------------ paddle/phi/backends/gpu/gpu_context.cc | 8 +- python/paddle/distributed/collective.py | 8 +- .../custom_runtime/process_group_xccl.py | 2 +- .../collective/process_group_gloo.py | 2 +- 15 files changed, 132 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index 72cd66467d956..9e6bbd93754ff 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -35,17 +35,6 @@ void ProcessGroup::Task::Synchronize() {} void ProcessGroup::Task::UpdateWaitChain(const phi::DeviceContext& ctx) {} -ProcessGroup::ProcessGroup(int rank, - int size, - const platform::Place& place, - int gid) - : rank_(rank), size_(size), place_(place), gid_(gid) { - if (gid != IGNORE_ID) { - auto map = ProcessGroupMapFromGid::getInstance(); - map->insert(gid_, this); - } -} - ProcessGroup::ProcessGroup(int rank, int size, int gid) : rank_(rank), size_(size), gid_(gid) { if (gid != IGNORE_ID) { @@ -66,5 +55,10 @@ ProcessGroup::Task::Task(int rank, bool sync_op) : rank_(rank), comm_type_(comm_type), sync_op_(sync_op) {} +ProcessGroupIdMap& ProcessGroupIdMap::GetInstance() { + static ProcessGroupIdMap instance; + return instance; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 8cc0cad8a5be0..1b712a857c37c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -82,13 +82,8 @@ class ProcessGroup { }; public: - explicit ProcessGroup(int rank, int size, int gid); + ProcessGroup(int rank, int size, int gid); virtual ~ProcessGroup() = default; - // TODO(dev): This constructor will be removed later. - explicit ProcessGroup(int rank, - int size, - const platform::Place& place, - int gid); int GetRank() const { return rank_; } @@ -290,12 +285,18 @@ class ProcessGroup { } protected: - const int rank_; - const int size_; - const platform::Place place_; - const int gid_; + int rank_; + int size_; + int gid_; }; +class ProcessGroupIdMap + : public std::unordered_map> { + public: + static ProcessGroupIdMap& GetInstance(); +}; + +// TODO(dev): The following method will be removed soon. 
class ProcessGroupMapFromGid { public: bool has(int gid) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 42d7d7200edcd..898166faae187 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -531,5 +531,13 @@ std::shared_ptr ProcessGroupBKCL::AllGather( /*use_calc_stream*/ false); } +std::shared_ptr ProcessGroupBKCL::CreateProcessGroupBKCL( + const std::shared_ptr& store, int rank, int size, int gid) { + auto process_group = + std::make_shared(store, rank, size, gid); + ProcessGroupIdMap::GetInstance().emplace(gid, process_group); + return process_group; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index 11c0dfbdc6234..b4a47e83fdd8a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -73,6 +73,9 @@ class ProcessGroupBKCL : public ProcessGroupStream { int size, int gid); + static std::shared_ptr CreateProcessGroupBKCL( + const std::shared_ptr& store, int rank, int size, int gid); + std::string GetBackendName() const override { return std::string(BKCL_BACKEND_NAME); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index 4eee250e48a52..7f75fc1784668 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -433,5 +433,18 @@ std::shared_ptr ProcessGroupCustom::Broadcast( CommType::BROADCAST); } +std::shared_ptr +ProcessGroupCustom::CreateProcessGroupCustom( + const std::shared_ptr& store, + const std::string& device_type, + int rank, + int size, + int gid) { + auto process_group = + std::make_shared(store, device_type, rank, size, gid); + ProcessGroupIdMap::GetInstance().emplace(gid, process_group); + return process_group; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index 6aca380258644..50e7583731e34 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -69,6 +69,13 @@ class ProcessGroupCustom : public ProcessGroup { int size, int gid); + static std::shared_ptr CreateProcessGroupCustom( + const std::shared_ptr& store, + const std::string& device_type, + int rank, + int size, + int gid); + std::string GetBackendName() const override { return "XCCL_" + device_type_; } std::shared_ptr AllGather( diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index f0a65b02fb69f..03eeea0684c1f 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -617,5 +617,22 @@ ProcessGroupGloo::createDefaultDevice() { return createDeviceForHostname("127.0.0.1"); } +std::shared_ptr ProcessGroupGloo::CreateProcessGroupGloo( + const std::shared_ptr& store, int rank, int size, int gid) { + std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; + auto opts = GlooOptions::create(); + char* ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); + if (ifname && strlen(ifname) > 1) { + opts->device = + 
ProcessGroupGloo::createDeviceForInterface(std::string(ifname)); + } else { + opts->device = ProcessGroupGloo::createDefaultDevice(); + } + auto process_group = + std::make_shared(store, rank, size, gid, opts); + ProcessGroupIdMap::GetInstance().emplace(gid, process_group); + return process_group; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 5e8dc1c5e602c..67294932926b3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" @@ -98,12 +99,17 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo( + ProcessGroupGloo(const std::shared_ptr& store, + int rank, + int world_size, + int gid, + std::shared_ptr options); + + static std::shared_ptr CreateProcessGroupGloo( const std::shared_ptr& store, int rank, int world_size, - int gid, - std::shared_ptr options); + int gid); ~ProcessGroupGloo() = default; @@ -191,7 +197,7 @@ class ProcessGroupGloo : public ProcessGroup { const std::string& ifname); static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); - protected: + private: uint32_t _tag; std::shared_ptr _context; std::shared_ptr<::gloo::rendezvous::Store> _store; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index f9ceaf089992c..016249963d79b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -1130,5 +1130,13 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::CreateProcessGroupNCCL( + const std::shared_ptr& store, int rank, int size, int gid) { + auto process_group = + std::make_shared(store, rank, size, gid); + ProcessGroupIdMap::GetInstance().emplace(gid, process_group); + return process_group; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index d50003ba5a702..73d484caca169 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -76,6 +76,9 @@ class ProcessGroupNCCL final : public ProcessGroupStream { }; public: + static std::shared_ptr CreateProcessGroupNCCL( + const std::shared_ptr& store, int rank, int size, int gid); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 7fdf4a0930ebf..6b612fda5337e 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -85,8 +85,6 @@ using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; using GlooOptions = paddle::distributed::ProcessGroupGloo::GlooOptions; #endif -static std::string GLOO_SOCKET_IFNAME_ENV = "GLOO_SOCKET_IFNAME"; // NOLINT - static UNUSED void *use_ccl_comm_func = phi::detail::GetCCLComm(phi::CPUPlace()); @@ -1221,24 +1219,18 @@ void BindDistributed(py::module *m) { py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - auto processGroupNCCL = - py::class_>( - *m, "ProcessGroupNCCL", ProcessGroupStream) - 
.def(py::init &, - int, - int, - int>(), - py::arg("store"), - py::arg("rank"), - py::arg("world_size"), - py::arg("group_id") = 0, - py::call_guard()); - - processGroupNCCL.def_static( - "group_start", []() { distributed::ProcessGroupNCCL::GroupStart(); }); - processGroupNCCL.def_static( - "group_end", []() { distributed::ProcessGroupNCCL::GroupEnd(); }); + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroupStream) + .def_static("create", + distributed::ProcessGroupNCCL::CreateProcessGroupNCCL, + py::arg("store"), + py::arg("rank"), + py::arg("world_size"), + py::arg("group_id") = 0, + py::call_guard()) + .def_static("group_start", distributed::ProcessGroupNCCL::GroupStart) + .def_static("group_end", distributed::ProcessGroupNCCL::GroupEnd); #endif @@ -1265,17 +1257,14 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupCustom", ProcessGroup) - .def(py::init &, - const std::string &, - int, - int, - int>(), - py::arg("store"), - py::arg("device_type"), - py::arg("rank"), - py::arg("world_size"), - py::arg("group_id") = 0, - py::call_guard()); + .def_static("create", + distributed::ProcessGroupCustom::CreateProcessGroupCustom, + py::arg("store"), + py::arg("device_type"), + py::arg("rank"), + py::arg("world_size"), + py::arg("group_id") = 0, + py::call_guard()); #endif @@ -1284,15 +1273,13 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupBKCL", ProcessGroupStream) - .def(py::init &, - int, - int, - int>(), - py::arg("store"), - py::arg("rank"), - py::arg("world_size"), - py::arg("group_id") = 0, - py::call_guard()); + .def_static("create", + distributed::ProcessGroupBKCL::CreateProcessGroupBKCL, + py::arg("store"), + py::arg("rank"), + py::arg("world_size"), + py::arg("group_id") = 0, + py::call_guard()); #endif py::class_>( *m, "ProcessGroupGloo", ProcessGroup) - .def(py::init &, - int, - int, - int, - std::shared_ptr &>(), - py::call_guard()) - .def(py::init([](const std::shared_ptr &store, - int rank, - int world_size, - int gid) { - auto opts = GlooOptions::create(); - char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); - if (ifname && strlen(ifname) > 1) { - opts->device = ProcessGroupGloo::createDeviceForInterface( - std::string(ifname)); - } else { - opts->device = ProcessGroupGloo::createDefaultDevice(); - } - return std::make_shared( - store, rank, world_size, gid, opts); - }), - py::arg("store"), - py::arg("rank"), - py::arg("world_size"), - py::arg("group_id") = 0, - py::call_guard()) + .def_static("create", + distributed::ProcessGroupGloo::CreateProcessGroupGloo, + py::arg("store"), + py::arg("rank"), + py::arg("world_size"), + py::arg("group_id") = 0, + py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); #endif diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 69f0baf069984..0e102911442f2 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -252,7 +252,13 @@ struct GPUContext::Impl { phi::DestroyDnnHandle(dnn_handle_); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); + // NOTE(liyurui): It is not recommend calling CUDA runtime API + // in destructor. Since we can not ensure the release order of + // static object, calling ncclCommDestroy in static object destructor + // is a undefined behavior, CUDA driver may be already unloaded + // from process. 
+ // If you really need to release the resource of nccl_comm, + // try to get the nccl_comm out and use ncclCommDestroy outside. } #endif phi::DestroyBlasHandle(blas_handle_); diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 4bdc473f9a0ac..5f33748559848 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -152,15 +152,15 @@ def _new_process_group_impl( genv = _get_global_env() assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": - pg = core.ProcessGroupGloo(store, rank, world_size, group_id) + pg = core.ProcessGroupGloo.create(store, rank, world_size, group_id) elif backend == "nccl": - pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) + pg = core.ProcessGroupNCCL.create(store, rank, world_size, group_id) elif backend == "xccl": - pg = core.ProcessGroupCustom( + pg = core.ProcessGroupCustom.create( store, genv.device_type, rank, world_size, group_id ) elif backend == "bkcl": - pg = core.ProcessGroupBKCL(store, rank, world_size, group_id) + pg = core.ProcessGroupBKCL.create(store, rank, world_size, group_id) return pg diff --git a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py index 9734b2e775e19..04607fcb18326 100644 --- a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py +++ b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py @@ -28,7 +28,7 @@ def init_process_group(strategy=None): rank = ParallelEnv().local_rank is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) - pg_group = core.ProcessGroupCustom( + pg_group = core.ProcessGroupCustom.create( store, ParallelEnv().device_type, rank, diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py index f93adb60910b5..f7d558e96b818 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_gloo.py @@ -42,7 +42,7 @@ def test_create_process_group_gloo(self): store = paddle.fluid.core.TCPStore( "127.0.0.1", 6272, is_master, nranks, 30 ) - pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) + pg = paddle.fluid.core.ProcessGroupGloo.create(store, rank, nranks) # test allreduce sum # rank 0 From 88410225adbc20a478190fe5c4924e04f354c855 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Mon, 21 Nov 2022 10:55:22 +0800 Subject: [PATCH 108/210] Unify `ProcessGroupNCCL` APIs underlying implementation (#48163) * refactor: replace Collective & PointToPoint with NCCLEnv * refactor: rename to RunFnInNCCLEnv * refactor: pass std::function by value --- .../distributed/collective/ProcessGroup.h | 11 +- .../collective/ProcessGroupNCCL.cc | 234 ++++++------------ .../distributed/collective/ProcessGroupNCCL.h | 18 +- .../collective/ProcessGroupStream.cc | 4 +- .../collective/ProcessGroupStream.h | 15 +- .../collective/global_gather_op.cu.cc | 2 +- .../collective/global_scatter_op.cu.cc | 2 +- .../collective/partial_send_op.cu.cc | 2 +- paddle/fluid/pybind/distributed_py.cc | 8 +- 9 files changed, 104 insertions(+), 192 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 1b712a857c37c..7abecd36e3d00 100644 --- 
a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -185,11 +185,12 @@ class ProcessGroup { GetBackendName())); } - virtual std::shared_ptr Send(phi::DenseTensor*, - int dst_rank, - int64_t offset, - int64_t numel, - bool sync_op) { + virtual std::shared_ptr Send( + const phi::DenseTensor& tensor, + int dst_rank, + int64_t offset, + int64_t numel, + bool sync_op) { PADDLE_THROW(platform::errors::Unimplemented( "ProcessGroup%s does not support send with sync_op flag.", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 016249963d79b..96666f50c91ef 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -137,21 +137,17 @@ std::shared_ptr ProcessGroupNCCL::AllGather( // numel > 0 indicates the tensor need to be sliced const phi::DenseTensor& in_tensor_maybe_partial = numel > 0 ? GetPartialTensor(in_tensor, offset, numel) : in_tensor; - return Collective( - out_tensor, - in_tensor_maybe_partial, - [](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclAllGather( - input.data(), - output->data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), + in_tensor_maybe_partial.data(), + out_tensor->data(), + in_tensor_maybe_partial.numel(), + platform::ToNCCLDataType(in_tensor_maybe_partial.dtype()), comm, stream)); }, + in_tensor_maybe_partial, CommType::ALLGATHER, sync_op, use_calc_stream); @@ -163,22 +159,18 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( const AllreduceOptions& opts, bool sync_op, bool use_calc_stream) { - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclAllReduce( - input.data(), - output->data(), - input.numel(), - platform::ToNCCLDataType(input.type()), + in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + platform::ToNCCLDataType(in_tensor.dtype()), ToNCCLRedType(opts.reduce_op), comm, stream)); }, + in_tensor, CommType::ALLREDUCE, sync_op, use_calc_stream); @@ -215,37 +207,32 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CheckSizeOnEachRank(out_dim, out_size_each_rank, size_); CheckSizeOnEachRank(in_dim, in_size_each_rank, size_); - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { - int64_t in_row_size = input.numel() / in_dim[0], - out_row_size = output->numel() / out_dim[0]; + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { + int64_t in_row_size = in_tensor.numel() / in_dim[0], + out_row_size = out_tensor->numel() / out_dim[0]; int64_t in_offset = 0, in_numel = 0, out_offset = 0, out_numel = 0; phi::DenseTensor input_partial, output_partial; GroupStart(); for (auto i = 0; i < size_; i++) { in_numel = in_size_each_rank[i] * in_row_size; - input_partial = GetPartialTensor(input, in_offset, in_numel); + input_partial = GetPartialTensor(in_tensor, in_offset, in_numel); NCCL_CHECK(platform::dynload::ncclSend( input_partial.data(), in_numel, - platform::ToNCCLDataType(input.dtype()), + platform::ToNCCLDataType(input_partial.dtype()), i, comm, 
stream)); in_offset += in_numel; out_numel = out_size_each_rank[i] * out_row_size; - output_partial = GetPartialTensor(*output, out_offset, out_numel); + output_partial = GetPartialTensor(*out_tensor, out_offset, out_numel); NCCL_CHECK(platform::dynload::ncclRecv( output_partial.data(), out_numel, - platform::ToNCCLDataType(output->dtype()), + platform::ToNCCLDataType(output_partial.dtype()), i, comm, stream)); @@ -253,6 +240,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( } GroupEnd(); }, + in_tensor, CommType::ALLTOALL, sync_op, use_calc_stream); @@ -286,23 +274,19 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( const BroadcastOptions& opts, bool sync_op, bool use_calc_stream) { - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { int root = opts.source_rank + opts.source_root; NCCL_CHECK(platform::dynload::ncclBroadcast( - input.data(), - output->data(), - input.numel(), - platform::ToNCCLDataType(input.type()), + in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + platform::ToNCCLDataType(in_tensor.dtype()), root, comm, stream)); }, + in_tensor, CommType::BROADCAST, sync_op, use_calc_stream); @@ -314,23 +298,19 @@ std::shared_ptr ProcessGroupNCCL::Reduce( const ReduceOptions& opts, bool sync_op, bool use_calc_stream) { - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclReduce( - input.data(), - output->data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), + in_tensor.data(), + out_tensor->data(), + in_tensor.numel(), + platform::ToNCCLDataType(in_tensor.dtype()), ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); }, + in_tensor, CommType::REDUCE, sync_op, use_calc_stream); @@ -342,22 +322,18 @@ std::shared_ptr ProcessGroupNCCL::ReduceScatter( const ReduceScatterOptions& opts, bool sync_op, bool use_calc_stream) { - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclReduceScatter( - input.data(), - output->data(), - output->numel(), - platform::ToNCCLDataType(input.dtype()), + in_tensor.data(), + out_tensor->data(), + out_tensor->numel(), + platform::ToNCCLDataType(in_tensor.dtype()), ToNCCLRedType(opts.reduce_op), comm, stream)); }, + in_tensor, CommType::REDUCE_SCATTER, sync_op, use_calc_stream); @@ -369,47 +345,43 @@ std::shared_ptr ProcessGroupNCCL::Scatter( const ScatterOptions& opts, bool sync_op, bool use_calc_stream) { - return Collective( - out_tensor, - in_tensor, - [&](phi::DenseTensor* output, - const phi::DenseTensor& input, - ncclComm_t comm, - gpuStream_t stream) { - int64_t numel = input.numel() / size_; + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { + int64_t numel = in_tensor.numel() / size_; if (rank_ == opts.root_rank) { int64_t offset = 0; phi::DenseTensor partial_tensor; GroupStart(); for (auto i = 0; i < size_; i++) { - partial_tensor = GetPartialTensor(input, offset, numel); + partial_tensor = GetPartialTensor(in_tensor, offset, numel); NCCL_CHECK(platform::dynload::ncclSend( partial_tensor.data(), numel, - 
platform::ToNCCLDataType(input.dtype()), + platform::ToNCCLDataType(partial_tensor.dtype()), i, comm, stream)); offset += numel; } NCCL_CHECK(platform::dynload::ncclRecv( - output->data(), + out_tensor->data(), numel, - platform::ToNCCLDataType(output->dtype()), + platform::ToNCCLDataType(out_tensor->dtype()), opts.root_rank, comm, stream)); GroupEnd(); } else { NCCL_CHECK(platform::dynload::ncclRecv( - output->data(), + out_tensor->data(), numel, - platform::ToNCCLDataType(output->dtype()), + platform::ToNCCLDataType(out_tensor->dtype()), opts.root_rank, comm, stream)); } }, + in_tensor, CommType::SCATTER, sync_op, use_calc_stream); @@ -428,54 +400,43 @@ std::shared_ptr ProcessGroupNCCL::Recv( partial_tensor = GetPartialTensor(*tensor, offset, numel); tensor = &partial_tensor; } - return PointToPoint( - tensor, - src_rank, - [](phi::DenseTensor* output, - int src, - ncclComm_t comm, - gpuStream_t stream) { + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclRecv( - output->data(), - output->numel(), - platform::ToNCCLDataType(output->dtype()), - src, + tensor->data(), + tensor->numel(), + platform::ToNCCLDataType(tensor->dtype()), + src_rank, comm, stream)); }, + *tensor, CommType::RECV, sync_op, use_calc_stream); } std::shared_ptr ProcessGroupNCCL::Send( - phi::DenseTensor* tensor, + const phi::DenseTensor& tensor, int dst_rank, int64_t offset, int64_t numel, bool sync_op, bool use_calc_stream) { // numel > 0 indicates the tensor need to be sliced - phi::DenseTensor partial_tensor; - if (numel > 0) { - partial_tensor = GetPartialTensor(*tensor, offset, numel); - tensor = &partial_tensor; - } - return PointToPoint( - tensor, - dst_rank, - [](phi::DenseTensor* input, - int dst, - ncclComm_t comm, - gpuStream_t stream) { + const phi::DenseTensor& tensor_maybe_partial = + numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor; + return RunFnInNCCLEnv( + [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclSend( - input->data(), - input->numel(), - platform::ToNCCLDataType(input->dtype()), - dst, + tensor_maybe_partial.data(), + tensor_maybe_partial.numel(), + platform::ToNCCLDataType(tensor_maybe_partial.dtype()), + dst_rank, comm, stream)); }, + tensor_maybe_partial, CommType::SEND, sync_op, use_calc_stream); @@ -548,54 +509,13 @@ void ProcessGroupNCCL::SyncCalcStream(const Place& place) { calc_event.Wait(platform::Place2DeviceType(place), comm_ctx); } -template -std::shared_ptr ProcessGroupNCCL::Collective( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - Fn fn, - CommType comm_type, - bool sync_op, - bool use_calc_stream) { - const auto& place = in_tensor.place(); - const auto& key = GetKeyFromPlace(place); - - platform::CUDADeviceGuard cuda_guard(place); - - if (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end()) { - CreateNCCLEnvCache(place, key); - } - - if (!use_calc_stream) { - SyncCalcStream(place); - } - - auto task = CreateTask(place, rank_, comm_type, sync_op, use_calc_stream); - - const auto* calc_ctx = place_to_calc_ctx_.at(key); - const auto& comm_ctx = place_to_comm_ctx_.at(key); - auto nccl_comm = comm_ctx->nccl_comm(); - auto nccl_stream = use_calc_stream ? 
calc_ctx->stream() : comm_ctx->stream(); - fn(out_tensor, in_tensor, nccl_comm, nccl_stream); - - if (!use_calc_stream) { - if (FLAGS_use_stream_safe_cuda_allocator) { - memory::RecordStream(in_tensor.Holder(), nccl_stream); - } - task->UpdateWaitChain(*comm_ctx); - } - - return task; -} - -template -std::shared_ptr ProcessGroupNCCL::PointToPoint( - phi::DenseTensor* tensor, - int rank, - Fn fn, +std::shared_ptr ProcessGroupNCCL::RunFnInNCCLEnv( + std::function fn, + const phi::DenseTensor& tensor, CommType comm_type, bool sync_op, bool use_calc_stream) { - const auto& place = tensor->place(); + const auto& place = tensor.place(); const auto& key = GetKeyFromPlace(place); platform::CUDADeviceGuard cuda_guard(place); @@ -614,11 +534,11 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( const auto& comm_ctx = place_to_comm_ctx_.at(key); auto nccl_comm = comm_ctx->nccl_comm(); auto nccl_stream = use_calc_stream ? calc_ctx->stream() : comm_ctx->stream(); - fn(tensor, rank, nccl_comm, nccl_stream); + fn(nccl_comm, nccl_stream); if (!use_calc_stream) { if (FLAGS_use_stream_safe_cuda_allocator) { - memory::RecordStream(tensor->Holder(), nccl_stream); + memory::RecordStream(tensor.Holder(), nccl_stream); } task->UpdateWaitChain(*comm_ctx); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 73d484caca169..5153b7a678dd4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -150,7 +150,7 @@ class ProcessGroupNCCL final : public ProcessGroupStream { bool sync_op, bool use_calc_stream) override; - std::shared_ptr Send(phi::DenseTensor* tensor, + std::shared_ptr Send(const phi::DenseTensor& tensor, int dst_rank, int64_t offset, int64_t numel, @@ -210,23 +210,13 @@ class ProcessGroupNCCL final : public ProcessGroupStream { void CreateNCCLEnvCache(const Place& place, const std::string& place_key); - template - std::shared_ptr Collective( - phi::DenseTensor* out_tensor, - const phi::DenseTensor& in_tensor, - Fn fn, + std::shared_ptr RunFnInNCCLEnv( + std::function fn, + const phi::DenseTensor& tensor, CommType comm_type, bool sync_op, bool use_calc_stream); - template - std::shared_ptr PointToPoint(phi::DenseTensor* tensor, - int rank, - Fn fn, - CommType op_type, - bool sync_op, - bool use_calc_stream); - void SyncCalcStream(const Place& place); // TODO(sunyilun): methods below will be removed later diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 332298ecfd4a2..e1ee425f3f888 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -212,7 +212,7 @@ std::shared_ptr ProcessGroupStream::Recv( } std::shared_ptr ProcessGroupStream::Send( - phi::DenseTensor* tensor, + const phi::DenseTensor& tensor, int dst_rank, int64_t offset, int64_t numel, @@ -226,7 +226,7 @@ std::shared_ptr ProcessGroupStream::Send( } std::shared_ptr ProcessGroupStream::Send( - phi::DenseTensor*, + const phi::DenseTensor& tensor, int dst_rank, int64_t offset, int64_t numel, diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index fcdbd88562edf..4ad75be3658b9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -168,18 +168,19 @@ class ProcessGroupStream : 
public ProcessGroup { bool sync_op, bool use_calc_stream); - std::shared_ptr Send(phi::DenseTensor* tensor, + std::shared_ptr Send(const phi::DenseTensor& tensor, int dst_rank, int64_t offset, int64_t numel, bool sync_op) override; - virtual std::shared_ptr Send(phi::DenseTensor* tensor, - int dst_rank, - int64_t offset, - int64_t numel, - bool sync_op, - bool use_calc_stream); + virtual std::shared_ptr Send( + const phi::DenseTensor& tensor, + int dst_rank, + int64_t offset, + int64_t numel, + bool sync_op, + bool use_calc_stream); }; } // namespace distributed diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 580f815c9ab6e..439630a7f1dd7 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -226,7 +226,7 @@ struct GlobalGatherProcessGroupFunctor { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { phi::DenseTensor tmp = *x; - pg->Send(&tmp, + pg->Send(tmp, j, send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index a6eb714662200..4ccf9dee2631f 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -224,7 +224,7 @@ struct GlobalScatterProcessGroupFunctor { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { phi::DenseTensor tmp = *x; - pg->Send(&tmp, + pg->Send(tmp, j, expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index b7196473c9ac1..7d4125be8d32e 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -70,7 +70,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); phi::DenseTensor tmp = *x; - auto task = pg->Send(&tmp, peer, offset, send_numel, /*sync_op*/ true); + auto task = pg->Send(tmp, peer, offset, send_numel, /*sync_op*/ true); task->Wait(); } else { gpuStream_t stream = nullptr; diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 6b612fda5337e..c5d03ce8853e3 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -168,7 +168,7 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); - auto *out_dense = p_dense.get(); + auto out_dense = *p_dense; // numel == -1 indicates sending the whole tensor return self.Send( out_dense, dst, /*offset*/ 0, /*numel*/ -1, sync_op); @@ -189,7 +189,7 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); - auto *out_dense = p_dense.get(); + auto out_dense = *p_dense; int64_t numel = p_dense->numel(); int64_t send_numel = numel / nranks; @@ -1126,7 +1126,7 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); - auto *out_dense = p_dense.get(); + auto out_dense = *p_dense; // numel == -1 indicates sending the whole tensor return self.Send(out_dense, dst, @@ -1149,7 +1149,7 @@ void 
BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto p_dense = std::dynamic_pointer_cast(tensor.impl()); - auto *out_dense = p_dense.get(); + auto out_dense = *p_dense; int64_t numel = p_dense->numel(); int64_t send_numel = numel / nranks; From 3501ff7d4afeb51852008906201d50822d1395f5 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Mon, 21 Nov 2022 11:04:07 +0800 Subject: [PATCH 109/210] [PHI decoupling] move cross_entropy from fluid to phi (#48160) * move cross_entropy from fluid to phi * replace mutable_data with Alloc * use .template --- .../c_softmax_with_cross_entropy_op.cu | 10 ++--- .../c_softmax_with_cross_entropy_op.h | 2 +- paddle/fluid/operators/cross_entropy_op.h | 6 +-- paddle/fluid/operators/math/CMakeLists.txt | 1 - .../softmax_with_cross_entropy_op_npu.cc | 2 +- .../phi/kernels/cpu/cross_entropy_kernel.cc | 4 +- paddle/phi/kernels/funcs/CMakeLists.txt | 1 + .../kernels/funcs}/cross_entropy.cc | 42 +++++++++---------- .../kernels/funcs}/cross_entropy.cu | 37 ++++++++-------- .../kernels/funcs}/cross_entropy.h | 31 +++++++------- .../kernels/gpu/cross_entropy_grad_kernel.cu | 1 - .../phi/kernels/gpu/cross_entropy_kernel.cu | 38 ++++++++--------- 12 files changed, 84 insertions(+), 91 deletions(-) rename paddle/{fluid/operators/math => phi/kernels/funcs}/cross_entropy.cc (78%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/cross_entropy.cu (85%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/cross_entropy.h (74%) diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 5a87f39092379..0881b702ec0d8 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h" -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" namespace paddle { namespace operators { @@ -237,9 +237,9 @@ struct CSoftmaxWithCrossEntropyFunctor { auto eigen_predicted_logits = math::EigenMatrix::From(predicted_logits); eigen_loss.device(*dev_ctx.eigen_device()) = - (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue()) - + (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue()) - eigen_predicted_logits) - .unaryExpr(math::TolerableValue()); + .unaryExpr(phi::funcs::TolerableValue()); eigen_softmax.device(*dev_ctx.eigen_device()) = (eigen_softmax * @@ -372,9 +372,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { auto eigen_predicted_logits = math::EigenMatrix::From(predicted_logits); eigen_loss.device(*dev_ctx.eigen_device()) = - (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue()) - + (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue()) - eigen_predicted_logits) - .unaryExpr(math::TolerableValue()); + .unaryExpr(phi::funcs::TolerableValue()); eigen_softmax.device(*dev_ctx.eigen_device()) = (eigen_softmax * diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index 0336d565de2bf..f3a438e729bb1 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -22,9 +22,9 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 4dcaf7b99f091..8ae6f448d24ba 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" #include "paddle/phi/kernels/funcs/math.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -51,7 +51,7 @@ class CrossEntropyOpKernel : public framework::OpKernel { } int axis_dim = x->dims()[rank - 1]; - math::CrossEntropyFunctor()( + phi::funcs::CrossEntropyFunctor()( ctx.template device_context(), &y_2d, &x_2d, @@ -190,7 +190,7 @@ struct HardLabelCrossEntropyForwardFunctor { label); auto match_x = x_[idx * feature_size_ + label]; - y_[idx] = -math::TolerableValue()(phi::funcs::real_log(match_x)); + y_[idx] = -phi::funcs::TolerableValue()(phi::funcs::real_log(match_x)); match_x_[idx] = match_x; } else { y_[idx] = 0; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 20933a4162fdc..9bc7473d967cd 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -21,7 +21,6 @@ else() math_library(concat_and_split DEPS concat_and_split_functor) endif() math_library(context_project DEPS im2col math_function) -math_library(cross_entropy) math_library(cos_sim_functor) math_library(depthwise_conv) math_library(im2col) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index ddcb07b4d77e4..d42f993f46219 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" namespace paddle { namespace operators { diff --git a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc index 27675fa8b5a54..cc855a4666043 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_kernel.cc @@ -14,11 +14,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cross_entropy_kernel.h" -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/softmax_kernel.h" @@ -64,7 +64,7 @@ void CrossEntropy(const CPUContext& dev_ctx, DenseTensor out_2d(*out); out_2d.Resize({n, d / axis_dim}); - paddle::operators::math::CrossEntropyFunctor()( + phi::funcs::CrossEntropyFunctor()( dev_ctx, &out_2d, &x_2d, &label_2d, soft_label, ignore_index, axis_dim); } diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 9c26e33ccebab..ac1bd1fd45c72 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -16,6 +16,7 @@ math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) +math_library(cross_entropy) cc_library( phi_data_layout_transform diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/phi/kernels/funcs/cross_entropy.cc similarity index 78% rename from paddle/fluid/operators/math/cross_entropy.cc rename to paddle/phi/kernels/funcs/cross_entropy.cc index f87f5a107e696..b93123fad808e 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/phi/kernels/funcs/cross_entropy.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,20 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/utils/data_type.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { using Tensor = phi::DenseTensor; template -using EigenMatrix = framework::EigenMatrix; +using EigenMatrix = phi::EigenMatrix; template struct HardLabelCrossEntropyCPUFunctorImpl { @@ -54,17 +53,17 @@ struct HardLabelCrossEntropyCPUFunctorImpl { for (int j = 0; j < num_remain; j++) { int lbl = static_cast(label_data[i * num_remain + j]); if (lbl != ignore_index_) { - PADDLE_ENFORCE_GE(lbl, - 0, - platform::errors::OutOfRange( - "label value should >= 0 when label " - "value(%f) not equal to ignore_index(%f)", - lbl, - ignore_index_)); + PADDLE_ENFORCE_GE( + lbl, + 0, + phi::errors::OutOfRange("label value should >= 0 when label " + "value(%f) not equal to ignore_index(%f)", + lbl, + ignore_index_)); PADDLE_ENFORCE_LT( lbl, axis_dim_, - platform::errors::OutOfRange( + phi::errors::OutOfRange( "label value should less than the shape of axis dimension " "when label value(%f) not equal to ignore_index(%f), But " "received label value as %ld and shape of axis dimension " @@ -79,7 +78,7 @@ struct HardLabelCrossEntropyCPUFunctorImpl { loss_data[loss_idx] = lbl == ignore_index_ ? 
0 - : -math::TolerableValue()(std::log(prob_data[index])); + : -phi::funcs::TolerableValue()(std::log(prob_data[index])); } } } @@ -112,19 +111,18 @@ void CrossEntropyFunctor::operator()( auto loss = EigenMatrix::From(*out); loss.device(*ctx.eigen_device()) = - -((lbl * in.log().unaryExpr(math::TolerableValue())) + -((lbl * in.log().unaryExpr(phi::funcs::TolerableValue())) .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); } else { HardLabelCrossEntropyCPUFunctorImpl functor_impl( out, prob, labels, ignore_index, axis_dim); - framework::VisitIntDataType(framework::TransToProtoVarType(labels->dtype()), - functor_impl); + phi::VisitDataType(labels->dtype(), functor_impl); } } template class CrossEntropyFunctor; template class CrossEntropyFunctor; -} // namespace math -} // namespace operators -} // namespace paddle + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu similarity index 85% rename from paddle/fluid/operators/math/cross_entropy.cu rename to paddle/phi/kernels/funcs/cross_entropy.cu index 8282f2b8a24f2..174c1c1bd934e 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" + #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math.h" -namespace paddle { -namespace operators { -namespace math { + +namespace phi { +namespace funcs { template __global__ void CrossEntropyKernel(T* Y, @@ -38,10 +39,9 @@ __global__ void CrossEntropyKernel(T* Y, D, ignore_index, lbl); - Y[i] = - ignore_index == lbl - ? static_cast(0) - : -math::TolerableValue()(phi::funcs::real_log(X[i * D + lbl])); + Y[i] = ignore_index == lbl ? 
static_cast(0) + : -phi::funcs::TolerableValue()( + phi::funcs::real_log(X[i * D + lbl])); } } @@ -56,10 +56,11 @@ __global__ void SoftCrossEntropyKernel(T* Y, int idx = blockIdx.x * class_num + tid; int end = blockIdx.x * class_num + class_num; for (; idx < end; idx += blockDim.x) { - val += math::TolerableValue()(phi::funcs::real_log(X[idx])) * label[idx]; + val += phi::funcs::TolerableValue()(phi::funcs::real_log(X[idx])) * + label[idx]; } - val = paddle::platform::reduceSum(val, tid, blockDim.x); + val = phi::backends::gpu::reduceSum(val, tid, blockDim.x); if (threadIdx.x == 0) { Y[blockIdx.x] = -val; } @@ -117,8 +118,8 @@ void CrossEntropyFunctor::operator()( const bool softLabel, const int ignore_index, const int axis_dim) { + T* loss_data = ctx.template Alloc(out); const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(ctx.GetPlace()); int batch_size = prob->dims()[0]; int class_num = prob->dims()[1]; @@ -145,8 +146,7 @@ void CrossEntropyFunctor::operator()( ignore_index, kMaxBlockDim, ctx.stream()); - framework::VisitDataType(framework::TransToProtoVarType(labels->dtype()), - functor); + phi::VisitDataType(labels->dtype(), functor); } } @@ -154,6 +154,5 @@ template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/phi/kernels/funcs/cross_entropy.h similarity index 74% rename from paddle/fluid/operators/math/cross_entropy.h rename to paddle/phi/kernels/funcs/cross_entropy.h index fba4c2ebc61c2..692ba5efef5b7 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/phi/kernels/funcs/cross_entropy.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,14 +15,13 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template struct TolerableValue { @@ -46,14 +45,15 @@ struct TolerableValue { // Also. In standard implementation of cross entropy, other // framework not has the ValueClipping. 
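// Rough illustration of what this value clipping buys a caller (a sketch only;
// SafeNegLog is a made-up helper and assumes <cmath> plus this header are
// included): a probability that underflows to zero makes std::log return -inf,
// and TolerableValue pulls infinities back into the representable range so the
// resulting loss term stays finite.
float SafeNegLog(float prob) {
  float raw = std::log(prob);  // -inf when prob == 0.f
  return -phi::funcs::TolerableValue<float>()(raw);  // large but finite
}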
template <> -struct TolerableValue { - HOSTDEVICE platform::float16 operator()(const platform::float16& x) const { - if (platform::isfinite(x)) +struct TolerableValue { + HOSTDEVICE phi::dtype::float16 operator()( + const phi::dtype::float16& x) const { + if (phi::dtype::isfinite(x)) return x; - else if (x > static_cast(0)) - return std::numeric_limits::max(); + else if (x > static_cast(0)) + return std::numeric_limits::max(); else - return std::numeric_limits::min(); + return std::numeric_limits::min(); } }; @@ -68,6 +68,5 @@ class CrossEntropyFunctor { const int ignore_index, const int axis_dim); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index df3e4bd0cf118..8618f947be457 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -22,7 +22,6 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index bee9fc801b795..93d5f06b66564 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -22,7 +22,6 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" @@ -31,6 +30,7 @@ namespace cub = hipcub; #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/cross_entropy.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" @@ -46,7 +46,7 @@ template static __device__ __forceinline__ T Log(T x) { using AccT = typename dtype::MPTypeTrait::Type; AccT logx = std::log(static_cast(x)); - return paddle::operators::math::TolerableValue()(static_cast(logx)); + return phi::funcs::TolerableValue()(static_cast(logx)); } // Wrapper of exp function. 
Use exp(float32) for float16 @@ -54,7 +54,7 @@ template static __device__ __forceinline__ T Exp(T x) { using AccT = typename dtype::MPTypeTrait::Type; AccT expx = std::exp(static_cast(x)); - return paddle::operators::math::TolerableValue()(static_cast(expx)); + return phi::funcs::TolerableValue()(static_cast(expx)); } template @@ -1285,16 +1285,15 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, DenseTensor softmax_out_2d(*softmax_out); softmax_out_2d.Resize({n, d}); - // math::CrossEntropyFunctor support axis is the last + // phi::funcs::CrossEntropyFunctor support axis is the last if (axis_v == -1) { - paddle::operators::math::CrossEntropyFunctor()( - dev_ctx, - &loss_2d, - &softmax_2d, - &labels_2d, - soft_label, - ignore_index, - axis_dim); + phi::funcs::CrossEntropyFunctor()(dev_ctx, + &loss_2d, + &softmax_2d, + &labels_2d, + soft_label, + ignore_index, + axis_dim); return; } @@ -1389,14 +1388,13 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, loss_2d.Resize({n, 1}); paddle::operators::math::SoftmaxCUDNNFunctor()( dev_ctx, &logits_2d, &softmax_2d); - paddle::operators::math::CrossEntropyFunctor()( - dev_ctx, - &loss_2d, - &softmax_2d, - &labels_2d, - false, - ignore_index, - axis_dim); + phi::funcs::CrossEntropyFunctor()(dev_ctx, + &loss_2d, + &softmax_2d, + &labels_2d, + false, + ignore_index, + axis_dim); } else { auto* logits_data = logits.data(); auto* labels_data = label.data(); From b546438c4e5cafb4a7a5d4967075004c4c8e6b5a Mon Sep 17 00:00:00 2001 From: wenbin Date: Mon, 21 Nov 2022 11:14:38 +0800 Subject: [PATCH 110/210] round (#48107) --- paddle/fluid/inference/tensorrt/convert/scale_op.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index 361ed22395532..a2cf5b487ce73 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -53,7 +53,8 @@ class ScaleOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { nvinfer1::ITensor* bias_tensor = - is_int ? Add1DConstantLayer(static_cast(bias)) + is_int ? Add1DConstantLayer( + static_cast(bias > 0 ? bias + 0.5 : bias - 0.5)) : Add1DConstantLayer(bias); bool is_bias_0 = (bias < 1e-06 && bias > -1e-06); @@ -75,7 +76,8 @@ class ScaleOpConverter : public OpConverter { is_scale_1 = false; } else { has_scale_tensor = false; - scale_tensor = is_int ? Add1DConstantLayer(static_cast(scale)) + scale_tensor = is_int ? Add1DConstantLayer(static_cast( + scale > 0 ? 
scale + 0.5 : scale - 0.5)) : Add1DConstantLayer(scale); is_scale_1 = ((scale - 1.0) < 1e-06 && (scale - 1.0) > -1e-06); } From 1ba308f5cd156cc3cdb75df25157324e76ac13e7 Mon Sep 17 00:00:00 2001 From: engineer1109 <1292846099@qq.com> Date: Mon, 21 Nov 2022 11:45:10 +0800 Subject: [PATCH 111/210] Update AUTHORS.md (#48177) --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index 1eb6d3b3c1731..71e94562cf2eb 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -20,6 +20,7 @@ This is an incomplete list of authors of [Paddle](https://github.com/PaddlePaddl | dragonwarrior | Long Wang | | dyning | Yuning Du | | emailweixu | Wei Xu | +| engineer1109 | Jia-Liang Wang | | gangliao | Gang Liao | | gongweibao | Wei-Bao Gong | | guru4elephant | Daxiang Dong | From 844ab6fe1d5eb5d214cfc090817c43c3aa2eee1e Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Mon, 21 Nov 2022 12:42:03 +0800 Subject: [PATCH 112/210] [Clean Fluid API]Remove API: gather (#47954) * Remove API: gather replace the paddle.fluid.layers.gather with paddle.gather * modify the call of gather from old style to new style --- python/paddle/fluid/layers/detection.py | 8 +- python/paddle/fluid/layers/nn.py | 76 ------------------- .../dygraph_to_static/bert_dygraph_model.py | 2 +- .../unittests/ipu/test_fp16_support_ipu.py | 2 +- .../tests/unittests/ipu/test_gather_op_ipu.py | 2 +- .../ir/inference/test_trt_gather_op.py | 5 +- .../unittests/mlu/test_gather_nd_op_mlu.py | 2 +- .../tests/unittests/mlu/test_gather_op_mlu.py | 6 +- .../unittests/npu/test_gather_nd_op_npu.py | 2 +- .../tests/unittests/npu/test_gather_op_npu.py | 2 +- .../tests/unittests/test_gather_nd_op.py | 2 +- .../fluid/tests/unittests/test_gather_op.py | 8 +- .../tests/unittests/test_imperative_deepcf.py | 4 +- 13 files changed, 23 insertions(+), 98 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 097875a75f2e8..1f5c0273e5911 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -332,8 +332,8 @@ def retinanet_target_assign( cls_logits = nn.reshape(x=cls_logits, shape=(-1, num_classes)) bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) - predicted_cls_logits = nn.gather(cls_logits, score_index) - predicted_bbox_pred = nn.gather(bbox_pred, loc_index) + predicted_cls_logits = paddle.gather(cls_logits, score_index) + predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) return ( predicted_cls_logits, @@ -514,8 +514,8 @@ def rpn_target_assign( cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) - predicted_cls_logits = nn.gather(cls_logits, score_index) - predicted_bbox_pred = nn.gather(bbox_pred, loc_index) + predicted_cls_logits = paddle.gather(cls_logits, score_index) + predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) return ( predicted_cls_logits, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 50602825c1d28..0ccd6ea0074a4 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -117,7 +117,6 @@ 'resize_bilinear', 'resize_trilinear', 'resize_nearest', - 'gather', 'gather_nd', 'scatter', 'scatter_nd_add', @@ -8612,81 +8611,6 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): return image_resize(input=input, out_shape=out_shape, resample=resample) -@deprecated(since="2.0.0", update_to="paddle.gather") -def gather(input, index, overwrite=True): - """ - - Output is 
obtained by gathering entries of the outer-most dimension - of X indexed by `index` and concatenate them together. - - .. math:: - - Out = X[Index] - - - .. code-block:: text - - - Given: - - X = [[1, 2], - [3, 4], - [5, 6]] - - Index = [1, 2] - - Then: - - Out = [[3, 4], - [5, 6]] - - Args: - input (Tensor): The source input tensor with rank>=1. Supported data type is - int32, int64, float32, float64 and uint8 (only for CPU), - float16 (only for GPU). - index (Tensor): The index input tensor with rank=1. Data type is int32 or int64. - overwrite (bool, optional): The mode that updating the grad when has same index. - If True, use the overwrite mode to update the grad of the same index, - if False, use the accumulate mode to update the grad of the same index. - Default value is True. - - Returns: - output (Tensor): The output is a tensor with the same rank as input. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - x = fluid.data(name='x', shape=[-1, 5], dtype='float32') - index = fluid.data(name='index', shape=[-1, 1], dtype='int32') - output = fluid.layers.gather(x, index) - """ - if _non_static_mode(): - return _legacy_C_ops.gather(input, index, None, 'overwrite', overwrite) - - check_variable_and_dtype( - input, - 'x', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], - 'gather', - ) - check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') - helper = LayerHelper('gather', **locals()) - dtype = helper.input_dtype() - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="gather", - inputs={"X": input, "Index": index}, - outputs={"Out": out}, - attrs={'overwrite': overwrite}, - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.gather_nd") def gather_nd(input, index, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index e5b85be96b8ba..721bc9122165b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -395,7 +395,7 @@ def forward( x=enc_output, shape=[-1, self._emb_size] ) - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) + mask_feat = paddle.gather(reshaped_emb_out, index=mask_pos) mask_trans_feat = self.pooled_fc(mask_feat) mask_trans_feat = self.pre_process_layer(mask_trans_feat) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py index 8433e45f46ce1..c0c6cb207355f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py @@ -97,7 +97,7 @@ def build_model(self): y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32' ) - out = paddle.fluid.layers.gather(x, index=y) + out = paddle.gather(x, index=y) self.fetch_list = [out.name] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index 5913db81f31cf..4f8a62e5c4e1a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -49,7 +49,7 @@ def build_model(self): y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], 
dtype='int32' ) - out = paddle.fluid.layers.gather(x, index=y, **self.attrs) + out = paddle.gather(x, index=y, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py index 72c62ff4127a6..4930fd12c8595 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class TRTGatherTest1(InferencePassTest): @@ -27,7 +28,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name='data', shape=[-1, 128], dtype='float32') index = fluid.data(name='index', shape=[-1, 1], dtype='int32') - scale_out = fluid.layers.gather(data, index=index) + scale_out = paddle.gather(data, index=index) out = fluid.layers.softmax(input=scale_out) self.feeds = { @@ -66,7 +67,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name='data', shape=[16, 64], dtype='float32') index = fluid.data(name='index', shape=[2], dtype='int32') - scale_out = fluid.layers.gather(data, index=index) + scale_out = paddle.gather(data, index=index) out = fluid.layers.softmax(input=scale_out) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py index 3fb5634df4532..bcec54413a348 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py @@ -281,7 +281,7 @@ def test_imperative(self): index_1 = np.array([[1]]).astype("int32") input = fluid.dygraph.to_variable(input_1) index = fluid.dygraph.to_variable(index_1) - output = paddle.fluid.layers.gather(input, index) + output = paddle.gather(input, index) output_np = output.numpy() expected_output = np.array([3, 4]) np.testing.assert_allclose(output_np[0], expected_output, rtol=1e-6) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py index 2f50c10a62f26..d4c5e966570f4 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py @@ -92,7 +92,7 @@ def test_out1(self): index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) - output = paddle.fluid.layers.gather(input, index) + output = paddle.gather(input, index) output_np = output.numpy() expected_output = np.array([[3, 4], [5, 6]]).astype('int32') np.testing.assert_allclose(output_np, expected_output) @@ -167,12 +167,12 @@ def test_error2(self): ) def test_x_type(): - paddle.fluid.layers.gather(x, index) + paddle.gather(x, index) self.assertRaises(TypeError, test_x_type) def test_index_type(): - paddle.fluid.layers.gather(x, index_float) + paddle.gather(x, index_float) self.assertRaises(TypeError, test_index_type) diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py index 4d66c28d6fa40..cea2e4ff37d3b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py +++ 
b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py @@ -270,7 +270,7 @@ def test_imperative(self): index_1 = np.array([[1]]) input = fluid.dygraph.to_variable(input_1) index = fluid.dygraph.to_variable(index_1) - output = paddle.fluid.layers.gather(input, index) + output = paddle.gather(input, index) output_np = output.numpy() expected_output = np.array([3, 4]) np.testing.assert_allclose(output_np[0], expected_output) diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py index 5e783b952af9c..1d27eadbc12f3 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py @@ -86,7 +86,7 @@ def test_out1(self): with fluid.program_guard(fluid.Program(), fluid.Program()): data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float32') index = fluid.layers.data('index', shape=[-1, 1], dtype='int32') - out = paddle.fluid.layers.gather(data1, index) + out = paddle.gather(data1, index) place = paddle.NPUPlace(0) exe = fluid.Executor(place) input = np.array([[1, 2], [3, 4], [5, 6]]) diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py index 176012e96a226..c49db2815ac1e 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py @@ -248,7 +248,7 @@ def test_imperative(self): index_1 = np.array([[1]]) input = fluid.dygraph.to_variable(input_1) index = fluid.dygraph.to_variable(index_1) - output = paddle.fluid.layers.gather(input, index) + output = paddle.gather(input, index) output_np = output.numpy() expected_output = np.array([[3, 4]]) np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index c6301d3e472cd..4f722c8bdc213 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -225,7 +225,7 @@ def test_out1(self): with fluid.program_guard(fluid.Program(), fluid.Program()): data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float64') index = fluid.layers.data('index', shape=[-1, 1], dtype='int32') - out = paddle.fluid.layers.gather(data1, index) + out = paddle.gather(data1, index) place = fluid.CPUPlace() exe = fluid.Executor(place) input = np.array([[1, 2], [3, 4], [5, 6]]) @@ -264,7 +264,7 @@ def test_out1(self): index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) - output = paddle.fluid.layers.gather(input, index) + output = paddle.gather(input, index) output_np = output.numpy() expected_output = np.array([[3, 4], [5, 6]]) np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) @@ -372,12 +372,12 @@ def test_error2(self): ) def test_x_type(): - paddle.fluid.layers.gather(x, index) + paddle.gather(x, index) self.assertRaises(TypeError, test_x_type) def test_index_type(): - paddle.fluid.layers.gather(x, index_float) + paddle.gather(x, index_float) self.assertRaises(TypeError, test_index_type) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 8c307f57da37d..2b0291b601e75 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ 
-119,8 +119,8 @@ def __init__(self, num_users, num_items, matrix): def forward(self, users, items): # users_emb = self._user_emb(users) # items_emb = self._item_emb(items) - users_emb = fluid.layers.gather(self._rating_matrix, users) - items_emb = fluid.layers.gather( + users_emb = paddle.gather(self._rating_matrix, users) + items_emb = paddle.gather( fluid.layers.transpose(self._rating_matrix, [1, 0]), items ) users_emb.stop_gradient = True From 1175a2b9977f3533c1da5bf47f23d578a2e13cd5 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Mon, 21 Nov 2022 12:42:14 +0800 Subject: [PATCH 113/210] Remove API: selu (#47969) replace paddle.fluid.layers.selu with paddle.nn.functional.selu --- python/paddle/fluid/layers/nn.py | 73 ------------------- .../fluid/tests/unittests/test_selu_op.py | 2 +- 2 files changed, 1 insertion(+), 74 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0ccd6ea0074a4..3e8479acfb995 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -124,7 +124,6 @@ 'random_crop', 'mean_iou', 'relu', - 'selu', 'log', 'crop', 'crop_tensor', @@ -9074,78 +9073,6 @@ def relu(x, name=None): return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.selu") -def selu(x, scale=None, alpha=None, name=None): - r""" - - Selu Operator. - - The equation is: - - .. math:: - selu= \\lambda* - \\begin{cases} - x &\\quad \\text{ if } x>0 \n - \\alpha * e^x - \\alpha &\\quad \\text{ if } x<=0 - \\end{cases} - - - The input `X` can carry the LoD (Level of Details) information, - or not. And the output shares the LoD information with input `X`. - - Args: - x (Variable): The input N-D Tensor. - scale(float, optional): lambda in selu activation function, - the default value is 1.0507009873554804934193349852946. - For more information about this value, please refer - to: https://arxiv.org/abs/1706.02515. - alpha(float, optional): alpha in selu activation function, - the default value is 1.6732632423543772848170429916717. - For more information about this value, please refer - to: https://arxiv.org/abs/1706.02515. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - - Returns: - Variable(Tensor|LoDTensor): The output Tensor or LoDTensor with the same shape and LoD information as input. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - paddle.enable_static() - - inputs = fluid.layers.data(name="x", shape=[2, 2], dtype="float32") - output = fluid.layers.selu(inputs) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - img = np.array([[0, 1],[2, 3]]).astype(np.float32) - - res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) - print(res) # [array([[0. 
, 1.050701],[2.101402, 3.152103]], dtype=float32)] - """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'selu') - - helper = LayerHelper('selu', **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - attrs = {} - if scale is not None: - attrs["scale"] = scale - if alpha is not None: - attrs["alpha"] = alpha - - helper.append_op( - type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs - ) - return out - - def mean_iou(input, label, num_classes): r""" Mean Intersection-Over-Union is a common evaluation metric for diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index 1390ddb5c9606..381f3aa5ef563 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -118,7 +118,7 @@ def test_dygraph_api(self): def test_fluid_api(self): with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.selu(x, self.scale, self.alpha) + out = F.selu(x, self.scale, self.alpha) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_selu(self.x_np, self.scale, self.alpha) From 5a45ceb2f65ef05b0e54940ff162efee58a334fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Mon, 21 Nov 2022 13:37:52 +0800 Subject: [PATCH 114/210] Remove fluid.layers.relu6 under fluid directory (#47876) * remove relu6 test case under fluid * fix relu6 test case in mkldnn_elt_act_fuse_pass --- python/paddle/fluid/layers/nn.py | 48 ------------------- .../dygraph_to_static/test_mobile_net.py | 2 +- .../test_mkldnn_elt_act_fuse_pass.py | 9 ++-- .../ir/inference/test_trt_activation_pass.py | 2 +- .../tests/unittests/test_activation_op.py | 2 +- 5 files changed, 6 insertions(+), 57 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3e8479acfb995..8f873b9ff5bdd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -14,7 +14,6 @@ """ All layers just related to the neural network. """ - import os import inspect import warnings @@ -127,7 +126,6 @@ 'log', 'crop', 'crop_tensor', - 'relu6', 'pow', 'hard_sigmoid', 'prelu', @@ -9580,52 +9578,6 @@ def pad2d( return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.relu6") -def relu6(x, threshold=6.0, name=None): - """ - - ${comment} - - Args: - x(${x_type}): ${x_comment} - threshold(float, optional): ${threshold_comment} - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - output(${out_type}): ${out_comment} - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - in1 = np.array([[-1,0],[2.5,7.8]]) - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(in1) - out1 = fluid.layers.relu6(x=x1, threshold=6.0) - print(out1.numpy()) - # [[0. 0. ] - # [2.5 6. 
]] - """ - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6') - - helper = LayerHelper('relu6', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='relu6', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'threshold': threshold, - 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"], - }, - ) - return out - - @templatedoc() def pow(x, factor=1.0, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 083345d9db99a..af50300be3a1c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -80,7 +80,7 @@ def forward(self, inputs, if_act=False): y = self._conv(inputs) y = self._batch_norm(y) if if_act: - y = fluid.layers.relu6(y) + y = paddle.nn.functional.relu6(y) return y diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index b926a8f71a9d5..3c2365c29e60c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -151,8 +151,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Relu6( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.relu6 - self.act_alpha = 5.0 + self.act = paddle.nn.functional.relu6 class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( @@ -244,8 +243,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.relu6 - self.act_alpha = 5.0 + self.act = paddle.nn.functional.relu6 class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( @@ -345,8 +343,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.relu6 - self.act_alpha = 5.0 + self.act = paddle.nn.functional.relu6 class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 80e51dbf26124..9b6e3e641037d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -71,7 +71,7 @@ def append_act(self, x): class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.relu6(x) + return paddle.nn.functional.relu6(x) class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index f460ed58c1d9c..ddd287a580318 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2074,7 +2074,7 @@ def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.relu6(x) + out = paddle.nn.functional.relu6(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': 
self.x_np}, fetch_list=[out]) out_ref = ref_relu6(self.x_np) From 394a7179b5918fe087c2132f35a10c35b7227627 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 21 Nov 2022 13:54:57 +0800 Subject: [PATCH 115/210] add check_xpu_dependence.sh script. (#48154) --- cmake/external/xpu.cmake | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a616387c0905e..3b09f92081e4e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -56,6 +56,9 @@ set(XPU_XCCL_URL set(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) +set(XPU_CHECK_DEPENCE_URL + "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/check_xpu_dependence.sh" + CACHE STRING "" FORCE) set(SNAPPY_PREFIX_DIR "${THIRD_PARTY_PATH}/xpu") set(XPU_DOWNLOAD_DIR "${SNAPPY_PREFIX_DIR}/src/${XPU_PROJECT}") @@ -80,9 +83,10 @@ ExternalProject_Add( PREFIX ${SNAPPY_PREFIX_DIR} DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} DOWNLOAD_COMMAND - wget ${XPU_PACK_DEPENCE_URL} && bash pack_paddle_depence.sh ${XPU_XRE_URL} - ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} - ${XPU_XCCL_DIR_NAME} + wget ${XPU_CHECK_DEPENCE_URL} && bash check_xpu_dependence.sh + ${XPU_BASE_URL} ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash + pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} + ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} From 27e252d98d315edf9c384e2f2194250ff3c6190c Mon Sep 17 00:00:00 2001 From: taixiurong Date: Mon, 21 Nov 2022 13:56:14 +0800 Subject: [PATCH 116/210] add adamw suppor xpu, test=kunlun (#48114) --- .../fluid/platform/device/xpu/xpu2_op_list.h | 4 +- paddle/phi/kernels/xpu/adamw_kernel.cc | 74 ++++++++++--------- .../tests/unittests/xpu/test_adamw_op_xpu.py | 17 +++-- 3 files changed, 51 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index ae6d53989c316..4db463f46ae0d 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -34,7 +34,9 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"adadelta", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"adamw", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adamw", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/paddle/phi/kernels/xpu/adamw_kernel.cc b/paddle/phi/kernels/xpu/adamw_kernel.cc index 9aeb77aee8856..20f226c850e6d 100644 --- a/paddle/phi/kernels/xpu/adamw_kernel.cc +++ b/paddle/phi/kernels/xpu/adamw_kernel.cc @@ -52,6 +52,7 @@ void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { + using XPUType = typename XPUTypeTrait::Type; bool skip_update_ = false; if (skip_update.is_initialized()) { PADDLE_ENFORCE_EQ( @@ -89,40 +90,42 @@ void AdamwDenseKernel(const Context& dev_ctx, beta2_pow_ptr = xpu_beta2_pow.template data(); } if (with_decay) { - int r = xpu::adamw(dev_ctx.x_context(), - grad.template data(), - 
moment1.template data(), - moment2.template data(), - param.template data(), - beta1_pow_ptr, - beta2_pow_ptr, - learning_rate.template data(), - dev_ctx.template Alloc(moment1_out), - dev_ctx.template Alloc(moment2_out), - dev_ctx.template Alloc(param_out), - beta1_, - beta2_, - epsilon_, - coeff, - param.numel()); + int r = xpu::adamw( + dev_ctx.x_context(), + reinterpret_cast(grad.template data()), + moment1.template data(), + moment2.template data(), + reinterpret_cast(param.template data()), + beta1_pow_ptr, + beta2_pow_ptr, + learning_rate.template data(), + dev_ctx.template Alloc(moment1_out), + dev_ctx.template Alloc(moment2_out), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + beta1_, + beta2_, + epsilon_, + coeff, + param.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); } else { - int r = xpu::adam(dev_ctx.x_context(), - grad.template data(), - moment1.template data(), - moment2.template data(), - param.template data(), - beta1_pow_ptr, - beta2_pow_ptr, - learning_rate.template data(), - dev_ctx.template Alloc(moment1_out), - dev_ctx.template Alloc(moment2_out), - dev_ctx.template Alloc(param_out), - beta1_, - beta2_, - epsilon_, - param.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + int r = xpu::adam( + dev_ctx.x_context(), + reinterpret_cast(grad.template data()), + moment1.template data(), + moment2.template data(), + reinterpret_cast(param.template data()), + beta1_pow_ptr, + beta2_pow_ptr, + learning_rate.template data(), + dev_ctx.template Alloc(moment1_out), + dev_ctx.template Alloc(moment2_out), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + beta1_, + beta2_, + epsilon_, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adam"); } if (!use_global_beta_pow) { @@ -145,7 +148,7 @@ void AdamwDenseKernel(const Context& dev_ctx, false, beta1_, 0.0f); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); r = xpu::scale(dev_ctx.x_context(), beta2_pow_ptr, beta2_pow_out_p, @@ -153,14 +156,15 @@ void AdamwDenseKernel(const Context& dev_ctx, false, beta2_, 0.0f); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); } } } } // namespace phi -PD_REGISTER_KERNEL(adamw, XPU, ALL_LAYOUT, phi::AdamwDenseKernel, float) { +PD_REGISTER_KERNEL( + adamw, XPU, ALL_LAYOUT, phi::AdamwDenseKernel, float, phi::dtype::float16) { // Skip beta1_pow, beta2_pow, skip_update data transform kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py index 7280258272b40..d9e0fb973c4d3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py @@ -90,9 +90,9 @@ def setUp(self): self.dtype = self.in_type_str param = np.random.uniform(-1, 1, self.shape).astype(self.dtype) grad = np.random.uniform(-1, 1, self.shape).astype(self.dtype) - moment1 = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + moment1 = np.random.uniform(-1, 1, self.shape).astype("float32") # The second moment is positive - moment2 = np.random.random(self.shape).astype(self.dtype) + moment2 = np.random.random(self.shape).astype("float32") learning_rate = 0.004 beta1 = 0.78 @@ -106,9 +106,9 @@ def setUp(self): 'Grad': grad, 'Moment1': moment1, 'Moment2': moment2, - 'LearningRate': np.array([learning_rate]).astype(self.dtype), - 'Beta1Pow': 
np.array([beta1_pow]).astype(self.dtype), - 'Beta2Pow': np.array([beta2_pow]).astype(self.dtype), + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), } self.attrs = { @@ -127,8 +127,8 @@ def setUp(self): 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, - 'Beta1PowOut': np.array([beta1_pow]).astype(self.dtype) * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype(self.dtype) * beta2, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def init_shape(self): @@ -305,7 +305,8 @@ def test_adamw_op_dygraph(self): support_types = get_xpu_op_support_types('adamw') for stype in support_types: create_test_class(globals(), XPUTestAdamwOp1, stype) - create_test_class(globals(), XPUTestAdamwOp2, stype) + if stype == "float32": + create_test_class(globals(), XPUTestAdamwOp2, stype) if __name__ == "__main__": paddle.enable_static() From 468f8815a5043934980e5a5fa70429f6470ceb66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Mon, 21 Nov 2022 13:59:30 +0800 Subject: [PATCH 117/210] =?UTF-8?q?=EF=BC=88fluid=E6=B8=85=E7=90=86?= =?UTF-8?q?=EF=BC=89Remove=20filter=20by=20instag=20in=20nn.py=20under=20f?= =?UTF-8?q?luid=20(#47929)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/fluid/layers/nn.py | 72 +------------------ .../fluid/tests/unittests/test_layers.py | 23 ------ 2 files changed, 1 insertion(+), 94 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8f873b9ff5bdd..adddf52c1e8ea 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import paddle from ..layer_helper import LayerHelper from paddle.fluid.framework import _in_legacy_dygraph -from ..initializer import Normal, Constant, NumpyArrayInitializer +from ..initializer import Normal, Constant from ..framework import ( Variable, OpProtoHolder, @@ -182,7 +182,6 @@ 'sign', 'unfold', 'deformable_roi_pooling', - 'filter_by_instag', 'shard_index', 'hard_swish', 'mish', @@ -9951,75 +9950,6 @@ def flatten(x, axis=1, name=None): return out -@templatedoc(op_type="filter_by_instag") -def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0): - """ - **Filter By Instag Layer** - - This function filter a batch of ins by instag, - There are multiple ins, and every ins belongs to some tags. - We can specify some tags we want. So the ins which belongs to that tags - remains in the output, and others removed. - - For example, one batch has 4 ins. Every ins has its tag list. - - | Ins | Ins_Tag | - |:-----:|:------:| - | 0 | 0, 1 | - | 1 | 1, 3 | - | 2 | 0, 3 | - | 3 | 2, 6 | - - And Lod is [1,1,1,1] - - And the filter tags [1] - - From the definition above, ins which has tag 1 can pass the filter - So Ins 0 and Ins 1 can pass and be seen in the output, - Ins 2 and 3 cannot pass because they do not has tag 1. - - Actually, if is_lod is false, it is normal tensor that equals to - lod_tensor with all 1, similar to the example above. - - Args: - ins (Variable): Input Variable (LoDTensor), usually it is 2D tensor - And first dimension can have lod info or not. 
- ins_tag (Variable): Input Variable (LoDTensor), usually it is 1D list - And split them by lod info - filter_tag (Variable): Input Variable (1D Tensor/List), usually it is - list that holds the tags. - is_lod (Bool): Boolean value to indicate ins is lod tensor or not. - out_val_if_empty(Int64): If the output after filter is empty, this value - will be set to Output tensor. - - Returns: - Variable: filtered ins (LoDTensor) and loss weight (Tensor) - - Examples: - .. code-block:: python - - import paddle.fluid.layers as layers - ins = layers.data(name='Ins', shape=[-1,32], lod_level=0, dtype='float64') - ins_tag = layers.data(name='Ins_tag', shape=[-1,16], lod_level=0, dtype='int64') - filter_tag = layers.data(name='Filter_tag', shape=[-1,16], dtype='int64') - out, loss_weight = layers.filter_by_instag(ins, ins_tag, filter_tag, True) - - """ - helper = LayerHelper('filter_by_instag', **locals()) - - out = helper.create_variable_for_type_inference(dtype=ins.dtype) - loss_weight = helper.create_variable_for_type_inference(dtype=np.float64) - mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype) - helper.append_op( - type='filter_by_instag', - inputs={'Ins': ins, 'Ins_tag': ins_tag, 'Filter_tag': filter_tag}, - outputs={'Out': out, 'LossWeight': loss_weight, 'IndexMap': mmap}, - attrs={'is_lod': is_lod, 'out_val_if_empty': out_val_if_empty}, - ) - - return [out, loss_weight] - - @deprecated(since='2.0.0', update_to="paddle.expand") def expand(x, expand_times, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 2eaf9432c6e87..27d7e42accf18 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -4284,29 +4284,6 @@ def test_sequence_slice(self): ) return out - def test_filter_by_instag(self): - # TODO(minqiyang): dygraph do not support lod now - with self.static_graph(): - x1 = layers.data( - name='Ins', shape=[32, 1], dtype='float32', lod_level=0 - ) - x2 = layers.data( - name='Ins_tag', - shape=[32, 1], - dtype='int64', - lod_level=0, - stop_gradient=True, - ) - x3 = layers.create_global_var( - shape=[1, 1], - value=20, - dtype='int64', - persistable=True, - force_cpu=True, - name='Filter_tag', - ) - out1, out2 = layers.filter_by_instag(x1, x2, x3, is_lod=True) - def test_shuffle_batch(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): From 3ca7328f75b4bb86274ded7c3226c64aa673b21c Mon Sep 17 00:00:00 2001 From: PuQing Date: Mon, 21 Nov 2022 14:19:07 +0800 Subject: [PATCH 118/210] [PHI decoupling] move "thread pool" from fluid to phi (#48075) * move threadpool fix cmake * fix make --- paddle/fluid/framework/CMakeLists.txt | 4 - paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/threadpool.h | 130 +-------------- paddle/fluid/framework/threadpool_test.cc | 2 +- paddle/phi/core/CMakeLists.txt | 4 + .../framework => phi/core}/threadpool.cc | 47 +++--- paddle/phi/core/threadpool.h | 150 ++++++++++++++++++ .../kernels/selected_rows/cpu/adam_kernel.cc | 14 +- 8 files changed, 189 insertions(+), 164 deletions(-) rename paddle/{fluid/framework => phi/core}/threadpool.cc (71%) create mode 100644 paddle/phi/core/threadpool.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 46c3dcca12a27..6fd4095d0d28f 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -225,10 +225,6 @@ cc_test( SRCS reader_test.cc 
DEPS reader) -cc_library( - threadpool - SRCS threadpool.cc - DEPS enforce) cc_test( threadpool_test SRCS threadpool_test.cc diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 88ffeb59503d3..a54110add67a8 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -155,7 +155,7 @@ void Scope::DeleteScope(Scope* scope) const { if (FLAGS_benchmark || FLAGS_eager_delete_scope) { delete scope; } else { - Async([scope] { delete scope; }); + phi::Async([scope] { delete scope; }); } } } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 7fecf07475b14..ac0ee6b933312 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -14,138 +14,16 @@ limitations under the License. */ #pragma once -#include // NOLINT -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#include "paddle/phi/core/threadpool.h" namespace paddle { namespace framework { -struct ExceptionHandler { - mutable std::future> future_; - explicit ExceptionHandler( - std::future>&& f) - : future_(std::move(f)) {} - void operator()() const { - auto ex = this->future_.get(); - if (ex != nullptr) { - PADDLE_THROW(platform::errors::Fatal( - "The exception is thrown inside the thread pool. You " - "should use RunAndGetException to handle the exception." - "The exception is:\n %s.", - ex->what())); - } - } -}; - -// ThreadPool maintains a queue of tasks, and runs them using a fixed -// number of threads. -class ThreadPool { - public: - explicit ThreadPool(int num_threads); - - using Task = std::packaged_task()>; - - // Returns the singleton of ThreadPool. - static ThreadPool* GetInstance(); - - ~ThreadPool(); - - // Run pushes a function to the task queue and returns a std::future - // object. To wait for the completion of the task, call - // std::future::wait(). - template - std::future Run(Callback fn) { - auto f = this->RunAndGetException(fn); - return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); - } - - template - std::future> RunAndGetException( - Callback fn) { - Task task([fn]() -> std::unique_ptr { - try { - fn(); - } catch (platform::EnforceNotMet& ex) { - return std::unique_ptr( - new platform::EnforceNotMet(ex)); - } catch (const std::exception& e) { - PADDLE_THROW(platform::errors::Fatal( - "Unexpected exception is catched in thread pool. All " - "throwable exception in Paddle should be an EnforceNotMet." - "The exception is:\n %s.", - e.what())); - } - return nullptr; - }); - std::future> f = task.get_future(); - { - std::unique_lock lock(mutex_); - if (!running_) { - PADDLE_THROW(platform::errors::Unavailable( - "Task is enqueued into stopped ThreadPool.")); - } - tasks_.push(std::move(task)); - } - scheduled_.notify_one(); - return f; - } - - private: - DISABLE_COPY_AND_ASSIGN(ThreadPool); - - // The constructor starts threads to run TaskLoop, which retrieves - // and runs tasks from the queue. - void TaskLoop(); - - // Init is called by GetInstance. 
- static void Init(); - - private: - static std::unique_ptr threadpool_; - static std::once_flag init_flag_; - - std::vector> threads_; - - std::queue tasks_; - std::mutex mutex_; - bool running_; - std::condition_variable scheduled_; -}; - -class ThreadPoolIO : ThreadPool { - public: - static ThreadPool* GetInstanceIO(); - static void InitIO(); - - private: - // NOTE: threadpool in base will be inhereted here. - static std::unique_ptr io_threadpool_; - static std::once_flag io_init_flag_; -}; +using ExceptionHandler = phi::ExceptionHandler; -// Run a function asynchronously. -// NOTE: The function must return void. If the function need to return a value, -// you can use lambda to capture a value pointer. -template -std::future Async(Callback callback) { - return ThreadPool::GetInstance()->Run(callback); -} +using ThreadPool = phi::ThreadPool; -template -std::future AsyncIO(Callback callback) { - return ThreadPoolIO::GetInstanceIO()->Run(callback); -} +using ThreadPoolIO = phi::ThreadPoolIO; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 59fc31c485f3b..25155a0f7e87c 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -26,7 +26,7 @@ void do_sum(std::vector>* fs, int cnt) { for (int i = 0; i < cnt; ++i) { std::lock_guard l(*mu); - fs->push_back(framework::Async([sum]() { sum->fetch_add(1); })); + fs->push_back(phi::Async([sum]() { sum->fetch_add(1); })); } } diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index d34f5f658b87b..90f5d38bfc93b 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -43,6 +43,10 @@ cc_library( lod_utils SRCS lod_utils.cc DEPS phi_enforce) +cc_library( + threadpool + SRCS threadpool.cc + DEPS phi_enforce) cc_library( dense_tensor diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/phi/core/threadpool.cc similarity index 71% rename from paddle/fluid/framework/threadpool.cc rename to paddle/phi/core/threadpool.cc index 1a1e017b59e09..db1f3091031fc 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/phi/core/threadpool.cc @@ -1,33 +1,32 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/fluid/framework/threadpool.h" +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/threadpool.h" #include #include "gflags/gflags.h" #include "glog/logging.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" +DECLARE_int32(dist_threadpool_size); DEFINE_int32(io_threadpool_size, 100, "number of threads used for doing IO, default 100"); -DECLARE_int32(dist_threadpool_size); +namespace phi { -namespace paddle { -namespace framework { std::unique_ptr ThreadPool::threadpool_(nullptr); std::once_flag ThreadPool::init_flag_; @@ -47,7 +46,7 @@ void ThreadPool::Init() { PADDLE_ENFORCE_GT( num_threads, 0, - platform::errors::InvalidArgument("The number of threads is 0.")); + phi::errors::InvalidArgument("The number of threads is 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } @@ -88,8 +87,8 @@ void ThreadPool::TaskLoop() { } if (tasks_.empty()) { - PADDLE_THROW(platform::errors::Unavailable( - "Current thread has no task to Run.")); + PADDLE_THROW( + phi::errors::Unavailable("Current thread has no task to Run.")); } // pop a task from the task queue @@ -115,6 +114,4 @@ void ThreadPoolIO::InitIO() { io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); } } - -} // namespace framework -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/core/threadpool.h b/paddle/phi/core/threadpool.h new file mode 100644 index 0000000000000..b45991f9a7f82 --- /dev/null +++ b/paddle/phi/core/threadpool.h @@ -0,0 +1,150 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace phi { + +struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + PADDLE_THROW(phi::errors::Fatal( + "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception." + "The exception is:\n %s.", + ex->what())); + } + } +}; + +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. +class ThreadPool { + public: + explicit ThreadPool(int num_threads); + + using Task = + std::packaged_task()>; + + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); + + ~ThreadPool(); + + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). 
+ template + std::future Run(Callback fn) { + auto f = this->RunAndGetException(fn); + return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); + } + + template + std::future> RunAndGetException( + Callback fn) { + Task task([fn]() -> std::unique_ptr { + try { + fn(); + } catch (phi::enforce::EnforceNotMet& ex) { + return std::unique_ptr( + new phi::enforce::EnforceNotMet(ex)); + } catch (const std::exception& e) { + PADDLE_THROW(phi::errors::Fatal( + "Unexpected exception is catched in thread pool. All " + "throwable exception in Paddle should be an EnforceNotMet." + "The exception is:\n %s.", + e.what())); + } + return nullptr; + }); + std::future> f = + task.get_future(); + { + std::unique_lock lock(mutex_); + if (!running_) { + PADDLE_THROW(phi::errors::Unavailable( + "Task is enqueued into stopped ThreadPool.")); + } + tasks_.push(std::move(task)); + } + scheduled_.notify_one(); + return f; + } + + private: + DISABLE_COPY_AND_ASSIGN(ThreadPool); + + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. + static void Init(); + + private: + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; + + std::vector> threads_; + + std::queue tasks_; + std::mutex mutex_; + bool running_; + std::condition_variable scheduled_; +}; + +class ThreadPoolIO : ThreadPool { + public: + static ThreadPool* GetInstanceIO(); + static void InitIO(); + + private: + // NOTE: threadpool in base will be inhereted here. + static std::unique_ptr io_threadpool_; + static std::once_flag io_init_flag_; +}; + +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. 
+template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +template +std::future AsyncIO(Callback callback) { + return ThreadPoolIO::GetInstanceIO()->Run(callback); +} + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index 2e7fe555feffc..de8b4eae4f660 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -15,10 +15,10 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/threadpool.h" #include "paddle/phi/kernels/funcs/adam_functors.h" #include "paddle/phi/kernels/funcs/selected_rows_functor.h" @@ -201,12 +201,12 @@ void AdamDenseParamSparseGradKernel( if (end > static_cast(param_row_count)) { end = static_cast(param_row_count); } - fs.push_back(paddle::framework::Async([&functor, - &row_id_to_grad_row_offset, - &grad_data, - row_numel, - start, - end]() { + fs.push_back(phi::Async([&functor, + &row_id_to_grad_row_offset, + &grad_data, + row_numel, + start, + end]() { for (int64_t row_id = start; row_id < end; ++row_id) { auto iter = row_id_to_grad_row_offset.find(row_id); if (iter != row_id_to_grad_row_offset.end()) { From d92daae2a8dbcc11c2fc6c51a5b9115a289f0b68 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Mon, 21 Nov 2022 14:47:00 +0800 Subject: [PATCH 119/210] Remove API: crop (#47972) remove crop which is not used in Paddle 2.0 --- python/paddle/fluid/layers/nn.py | 103 ------------------ .../fluid/tests/unittests/test_layers.py | 9 -- 2 files changed, 112 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index adddf52c1e8ea..e9ca037e49916 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -124,7 +124,6 @@ 'mean_iou', 'relu', 'log', - 'crop', 'crop_tensor', 'pow', 'hard_sigmoid', @@ -9137,108 +9136,6 @@ def mean_iou(input, label, num_classes): return out_mean_iou, out_wrong, out_correct -def crop(x, shape=None, offsets=None, name=None): - """ - Crop input into output, as specified by offsets and shape. - - **Warning:** THIS OP IS DEPRECATED. It will be removed in the future version. - Instructions for updating: Use :ref:`api_fluid_layers_crop_tensor` instead. - - .. code-block:: text - - * Case 1: - Given - X = [[0, 1, 2, 0, 0] - [0, 3, 4, 0, 0] - [0, 0, 0, 0, 0]], - and - shape = [2, 2], - offsets = [0, 1], - output is: - Out = [[1, 2], - [3, 4]]. - * Case 2: - Given - X = [[0, 1, 2, 5, 0] - [0, 3, 4, 6, 0] - [0, 0, 0, 0, 0]], - and shape is tensor - shape = [[0, 0, 0] - [0, 0, 0]] - and - offsets = [0, 1], - - output is: - Out = [[1, 2, 5], - [3, 4, 6]]. - - Parameters: - x (Variable): Tensor, data type can be float32 or float64. - shape (Variable|list/tuple of integers, optional): The output shape is specified - by `shape`, which can be a Tensor or a list/tuple of integers. - If it is a Tensor, it's rank must be the same as `x` , only - it's shape will be used, and the value of it will be ignored. This way - is suitable for the case that the output shape may be changed each - iteration. 
If it is a list/tuple of integers, it's length must be the same - as the rank of `x` - offsets (Variable|list/tuple of integers|None, optional): Specifies the cropping - offsets at each dimension. It can be a Tensor or a list/tuple - of integers. If it is a Tensor, it's rank must be the same as `x`. - This way is suitable for the case that the offsets may be changed - each iteration. If it is a list/tuple of integers, it's length must be the - same as the rank of `x`. If None, the offsets are 0 at each dimension. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name` . Usually name is no need to set and - None by default. - - Returns: - Tensor, The cropped Tensor, which has the same rank and data type with `x`. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid as fluid - import paddle - paddle.enable_static() - x = fluid.data(name="x", shape=[3, 3, 5], dtype="float32") - y = fluid.data(name="y", shape=[2, 2, 3], dtype="float32") - crop = fluid.layers.crop(x, shape=y) - - # or - z = fluid.data(name="z", shape=[3, 3, 5], dtype="float32") - crop = fluid.layers.crop(z, shape=[2, 2, 3]) - - """ - check_variable_and_dtype(x, 'x', ['float32'], 'crop') - check_type(shape, 'shape', (list, tuple, Variable), 'crop') - helper = LayerHelper('crop', **locals()) - - if offsets is None: - offsets = [0] * len(x.shape) - - out = helper.create_variable_for_type_inference(x.dtype) - ipts = {'X': x} - attrs = {} - if isinstance(shape, Variable): - ipts['Y'] = shape - else: - attrs['shape'] = shape - if isinstance(offsets, Variable): - ipts['Offsets'] = offsets - else: - attrs['offsets'] = offsets - - helper.append_op( - type='crop', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs, - ) - return out - - def crop_tensor(x, shape=None, offsets=None, name=None): """ Crop input into output, as specified by offsets and shape. 
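For reference, the deprecation warning in the removed docstring points callers at `crop_tensor`, whose signature is visible in the trailing context lines above. The following is only an illustrative migration sketch of the removed static-graph example, not content taken from this patch; the variable names, shapes and offsets are placeholders.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    # Same declarative setup as the removed docstring example.
    z = fluid.data(name="z", shape=[3, 3, 5], dtype="float32")
    # was: cropped = fluid.layers.crop(z, shape=[2, 2, 3])
    cropped = fluid.layers.crop_tensor(z, shape=[2, 2, 3], offsets=[0, 0, 0])
    # In 2.x user code, paddle.crop(z, shape=[2, 2, 3]) is the assumed equivalent call.
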
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 27d7e42accf18..f97c94858bd6a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3588,15 +3588,6 @@ def make_l2_normalize(self): output = layers.l2_normalize(x, axis=1) return output - def make_crop(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data(name='x', shape=[3, 5], dtype="float32") - y = self._get_data(name='y', shape=[2, 3], dtype="float32") - output = layers.crop(x, shape=y) - return output - def make_mean_iou(self): with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): x = self._get_data(name='x', shape=[16], dtype='int32') From 69eeaf03d50bcb2966aded91aca4a8110cf2333d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Mon, 21 Nov 2022 15:17:43 +0800 Subject: [PATCH 120/210] [fluid clean] remove fluid.layers.expand_as in nn.py under fluid (#47931) --- python/paddle/fluid/layers/nn.py | 89 ------------------- .../unittests/auto_parallel_gpt_model.py | 4 +- .../test_auto_parallel_completion_gpt.py | 4 +- .../test_auto_parallel_partitioner_gpt.py | 4 +- .../tests/unittests/test_expand_as_op.py | 44 --------- 5 files changed, 3 insertions(+), 142 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e9ca037e49916..d9253f50a1bea 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -135,7 +135,6 @@ 'unique', 'unique_with_counts', 'expand', - 'expand_as', 'scale', 'elementwise_add', 'elementwise_div', @@ -9969,94 +9968,6 @@ def get_attr_expand_times(list_expand_times): return out -@deprecated(since='2.0.0', update_to="paddle.expand_as") -def expand_as(x, target_tensor, name=None): - """ - :alias_main: paddle.expand_as - :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as - :old_api: paddle.fluid.layers.expand_as - - expand_as operator tiles to the input by given expand tensor. You should set expand tensor - for each dimension by providing tensor 'target_tensor'. The rank of X - should be in [1, 6]. Please note that size of 'target_tensor' must be the same - with X's rank. Following is a using case: - - - .. code-block:: text - - Input(X) is a 3-D tensor with shape [2, 3, 1]: - - [ - [[1], [2], [3]], - [[4], [5], [6]] - ] - - target_tensor's shape: [2, 6, 2] - - Output(Out) is a 3-D tensor with shape [2, 6, 2]: - - [ - [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], - [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] - ] - - - Args: - x (Variable): A Tensor with dtype float64, float32, int32. - A tensor with rank in [1, 6]. - target_tensor (Variable): A Tensor with dtype float64, float32, int32. - target_tensor for expanding to Input(X). Only use target_tensor'shape. - - Returns: - Variable: A Tensor with dtype float64, float32, int32. - After expanding, size of each dimension of Output(Out) is equal to the size - of the corresponding dimension of target_tensor multiplying the corresponding - value given by target_tensor. - - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - paddle.enable_static() - - data = fluid.layers.data(name="data", shape=[-1,10], dtype='float64') - target_tensor = fluid.layers.data( - name="target_tensor", shape=[-1,20], dtype='float64') - result = fluid.layers.expand_as(x=data, target_tensor=target_tensor) - use_cuda = False - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - x = np.random.rand(3,10) - y = np.random.rand(3,20) - output= exe.run(feed={"data":x,"target_tensor":y},fetch_list=[result.name]) - print(output[0].shape) - #(3,20) - - """ - if _non_static_mode(): - return _legacy_C_ops.expand_as(x, target_tensor) - - check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as' - ) - check_variable_and_dtype( - target_tensor, - 'target_tensor', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'expand_as', - ) - helper = LayerHelper('expand_as', input=x, **locals()) - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - inputs = {'X': x, 'target_tensor': target_tensor} - helper.append_op(type='expand_as', inputs=inputs, outputs={'Out': out}) - return out - - from paddle.fluid.framework import convert_np_dtype_to_dtype_ diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 829e7f7a5ddc5..425f00d12198d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -724,9 +724,7 @@ def forward( dtype='int64', ) position_ids = position_ids.unsqueeze(0) - position_ids = paddle.fluid.layers.expand_as( - position_ids, input_ids - ) + position_ids = paddle.expand_as(position_ids, input_ids) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids ) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index 75af22f2912ed..0febac998b32c 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -616,9 +616,7 @@ def forward( ) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) - position_ids = paddle.fluid.layers.expand_as( - position_ids, input_ids - ) + position_ids = paddle.expand_as(position_ids, input_ids) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids ) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 92528009bdc35..b65a2351244b6 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -662,9 +662,7 @@ def forward( ) position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) - position_ids = paddle.fluid.layers.expand_as( - position_ids, input_ids - ) + position_ids = paddle.expand_as(position_ids, input_ids) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids ) diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_op.py index 827f7a73a75de..868f0d269ee44 100755 --- 
a/python/paddle/fluid/tests/unittests/test_expand_as_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_op.py @@ -15,7 +15,6 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid def bcast(x, target_tensor): @@ -100,48 +99,5 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -# Test dygraph API -class TestExpandAsDygraphAPI(unittest.TestCase): - def test_api(self): - import paddle - - paddle.disable_static() - np_data_x = np.array([1, 2, 3]).astype('int32') - np_data_y = np.array([1, 2, 3, 1, 2, 3]).astype('int32') - data_x = paddle.to_tensor(np_data_x) - data_y = paddle.to_tensor(np_data_y) - out = fluid.layers.expand_as(data_x, data_y) - np_out = out.numpy() - assert np.array_equal(np_out, np.tile(np_data_x, (2))) - paddle.enable_static() - - -# Test python API -class TestExpandAsAPI(unittest.TestCase): - def test_api(self): - input1 = np.random.random([12, 14]).astype("float32") - input2 = np.random.random([48, 14]).astype("float32") - x = fluid.layers.data( - name='x', shape=[12, 14], append_batch_size=False, dtype="float32" - ) - - y = fluid.layers.data( - name='target_tensor', - shape=[48, 14], - append_batch_size=False, - dtype="float32", - ) - - out_1 = fluid.layers.expand_as(x, target_tensor=y) - - exe = fluid.Executor(place=fluid.CPUPlace()) - res_1 = exe.run( - fluid.default_main_program(), - feed={"x": input1, "target_tensor": input2}, - fetch_list=[out_1], - ) - assert np.array_equal(res_1[0], np.tile(input1, (4, 1))) - - if __name__ == "__main__": unittest.main() From 70589379211de9b1b63681c55fa771776974a848 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 21 Nov 2022 15:47:37 +0800 Subject: [PATCH 121/210] Fix wrong eigen header include in data_type.h (#48157) * Fix wrong eigen header include * fix compile bug --- paddle/phi/core/utils/data_type.h | 2 -- paddle/phi/kernels/funcs/eigen/eigen_function.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index a7bf1123a05a0..7852d87c9a293 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -20,8 +20,6 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/funcs/eigen/extensions.h" - namespace phi { // Here we can't depend on the fluid proto::VarType, so we use the dtype enum diff --git a/paddle/phi/kernels/funcs/eigen/eigen_function.h b/paddle/phi/kernels/funcs/eigen/eigen_function.h index 1e81256e79e14..42f46b814e168 100644 --- a/paddle/phi/kernels/funcs/eigen/eigen_function.h +++ b/paddle/phi/kernels/funcs/eigen/eigen_function.h @@ -18,6 +18,8 @@ limitations under the License. 
*/ #ifndef NOMINMAX #define NOMINMAX #endif + +#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "unsupported/Eigen/CXX11/Tensor" namespace phi { From 208f625b6cad44cf0324ad034dca0e3a91610392 Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 21 Nov 2022 16:00:18 +0800 Subject: [PATCH 122/210] [Fluid Clean] remove apis in fluid.layers.ops (#47867) * remove apis in fluid.ops * fix test_activation_nn_grad * fix circle import error * fix ops * fix cos * fix divide not inplace * remove lazy-import part --- .../meta_optimizers/localsgd_optimizer.py | 4 +- .../sharding/group_sharded_utils.py | 4 +- .../meta_parallel/sharding/sharding_utils.py | 4 +- python/paddle/distribution/categorical.py | 8 +- python/paddle/distribution/normal.py | 3 +- python/paddle/fluid/clip.py | 8 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 19 +- .../contrib/slim/quantization/adaround.py | 8 +- .../contrib/tests/test_weight_decay_extend.py | 2 +- .../fluid/dygraph/learning_rate_scheduler.py | 24 +- python/paddle/fluid/dygraph/rnn.py | 15 +- python/paddle/fluid/layers/__init__.py | 3 - python/paddle/fluid/layers/control_flow.py | 2 +- python/paddle/fluid/layers/detection.py | 4 +- python/paddle/fluid/layers/distributions.py | 7 +- python/paddle/fluid/layers/io.py | 1 - .../fluid/layers/layer_function_generator.py | 16 - .../fluid/layers/learning_rate_scheduler.py | 15 +- python/paddle/fluid/layers/loss.py | 1 - python/paddle/fluid/layers/ops.py | 975 ------------------ python/paddle/fluid/nets.py | 3 +- python/paddle/fluid/optimizer.py | 3 +- .../tests/book/notest_understand_sentiment.py | 10 +- .../tests/book/test_rnn_encoder_decoder.py | 10 +- python/paddle/fluid/tests/test_if_else_op.py | 6 +- .../tests/unittests/dist_fleet_simnet_bow.py | 6 +- .../dygraph_to_static/ifelse_simple_func.py | 6 +- .../seq2seq_dygraph_model.py | 12 +- .../dygraph_to_static/simnet_dygraph_model.py | 3 +- .../dygraph_to_static/test_cycle_gan.py | 28 +- .../dygraph_to_static/test_ptb_lm.py | 8 +- .../dygraph_to_static/test_sentiment.py | 8 +- .../dygraph_to_static/test_word2vec.py | 3 +- .../tests/unittests/ipu/test_cumsum_op_ipu.py | 6 +- .../tests/unittests/ipu/test_gelu_op_ipu.py | 2 +- .../tests/unittests/ipu/test_unary_ops_ipu.py | 44 +- .../test_mkldnn_elt_act_fuse_pass.py | 34 +- .../ir/inference/test_trt_activation_pass.py | 17 +- .../unittests/ir/test_ir_fusion_group_pass.py | 17 +- .../unittests/mlu/sync_batch_norm_op_mlu.py | 2 +- .../tests/unittests/mlu/test_gelu_op_mlu.py | 2 +- .../unittests/npu/sync_batch_norm_op_npu.py | 2 +- .../tests/unittests/npu/test_gelu_op_npu.py | 2 +- .../fluid/tests/unittests/simple_nets.py | 2 +- .../unittests/test_activation_nn_grad.py | 22 +- .../tests/unittests/test_activation_op.py | 157 +-- .../fluid/tests/unittests/test_cumsum_op.py | 2 +- .../tests/unittests/test_dist_fleet_ps.py | 6 +- .../tests/unittests/test_dist_fleet_ps11.py | 6 +- .../tests/unittests/test_dist_fleet_ps12.py | 6 +- .../tests/unittests/test_dist_fleet_ps13.py | 6 +- .../tests/unittests/test_dist_fleet_ps2.py | 6 +- .../tests/unittests/test_dist_fleet_ps3.py | 6 +- .../tests/unittests/test_dist_fleet_ps4.py | 6 +- .../tests/unittests/test_dist_fleet_ps5.py | 6 +- .../tests/unittests/test_dist_fleet_ps6.py | 6 +- .../unittests/test_eager_deletion_gru_net.py | 2 +- .../unittests/test_eager_deletion_lstm_net.py | 2 +- .../test_eager_deletion_padding_rnn.py | 16 +- .../test_eager_deletion_recurrent_op.py | 4 +- .../fluid/tests/unittests/test_erf_op.py | 2 +- .../fluid/tests/unittests/test_gelu_op.py | 4 +- 
.../tests/unittests/test_gradient_clip.py | 2 +- .../tests/unittests/test_imperative_basic.py | 4 +- .../test_imperative_ocr_attention_model.py | 2 +- .../unittests/test_imperative_ptb_rnn.py | 8 +- .../unittests/test_imperative_save_load.py | 8 +- .../unittests/test_imperative_save_load_v2.py | 8 +- ...perative_star_gan_with_gradient_penalty.py | 10 +- .../unittests/test_imperative_triple_grad.py | 4 +- .../tests/unittests/test_inplace_abn_op.py | 2 +- .../unittests/test_ir_memory_optimize_nlp.py | 2 +- .../fluid/tests/unittests/test_layers.py | 128 --- .../fluid/tests/unittests/test_lgamma_op.py | 2 +- .../unittests/test_program_prune_backward.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 2 +- .../tests/unittests/test_recurrent_op.py | 8 +- .../fluid/tests/unittests/test_regularizer.py | 4 +- .../tests/unittests/test_regularizer_api.py | 4 +- .../tests/unittests/test_retain_graph.py | 2 +- .../tests/unittests/test_static_save_load.py | 8 +- .../unittests/test_sync_batch_norm_op.py | 2 +- .../tests/unittests/test_weight_decay.py | 2 +- .../tests/unittests/transformer_model.py | 3 +- .../distributed/models/moe/grad_clip.py | 2 +- .../paddle/incubate/optimizer/modelaverage.py | 2 +- 86 files changed, 294 insertions(+), 1551 deletions(-) mode change 100755 => 100644 python/paddle/fluid/layers/layer_function_generator.py delete mode 100755 python/paddle/fluid/layers/ops.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 67cd428f3b969..62ff253fb7765 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -449,8 +449,8 @@ def communicate_avg_loss(): communicate() self._generate_avg_loss(main_block, loss, avg_loss) next_local_steps = layers.cast( - layers.ceil( - layers.sqrt( + paddle.ceil( + paddle.sqrt( lr_0 * avg_loss / (global_lr * loss_0) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 5ab3ffb4e4eec..2b883fe67e006 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -68,7 +68,7 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows( layers.merge_selected_rows(g) ) - square = layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) if p.dtype == paddle.float16: @@ -133,7 +133,7 @@ def _dygraph_clip(self, params_grads): with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm ) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 4cee382339538..c5ce4c249ea09 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -69,7 +69,7 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows( layers.merge_selected_rows(g) ) - square = 
layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) if p.dtype == paddle.float16: @@ -131,7 +131,7 @@ def _dygraph_clip(self, params_grads): with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm ) diff --git a/python/paddle/distribution/categorical.py b/python/paddle/distribution/categorical.py index bd6c570ca1f10..b6aa173c298b4 100644 --- a/python/paddle/distribution/categorical.py +++ b/python/paddle/distribution/categorical.py @@ -17,7 +17,7 @@ from paddle.distribution import distribution from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.framework import _non_static_mode -from paddle.fluid.layers import ops, tensor +from paddle.fluid.layers import tensor from paddle.tensor import multinomial @@ -214,8 +214,8 @@ def kl_divergence(self, other): other_logits = other.logits - paddle.max( other.logits, axis=-1, keepdim=True ) - e_logits = ops.exp(logits) - other_e_logits = ops.exp(other_logits) + e_logits = paddle.exp(logits) + other_e_logits = paddle.exp(other_logits) z = paddle.sum(e_logits, axis=-1, keepdim=True) other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True) prob = e_logits / z @@ -255,7 +255,7 @@ def entropy(self): """ name = self.name + '_entropy' logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True) - e_logits = ops.exp(logits) + e_logits = paddle.exp(logits) z = paddle.sum(e_logits, axis=-1, keepdim=True) prob = e_logits / z diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index a7cd0e4d03a31..061d68ab5afa0 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -23,7 +23,6 @@ elementwise_div, elementwise_sub, nn, - ops, tensor, ) @@ -288,7 +287,7 @@ def probs(self, value): var = self.scale * self.scale return elementwise_div( - ops.exp( + paddle.exp( -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var) ), (math.sqrt(2 * math.pi) * self.scale), diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 19c8629fa9ad8..1242fb172a0e5 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -72,7 +72,7 @@ def _squared_l2_norm(x): or x.dtype == core.VarDesc.VarType.FP16 or x.dtype == core.VarDesc.VarType.BF16 ): - square = layers.square(x) + square = paddle.square(x) sum_square = layers.reduce_sum(square) return sum_square @@ -540,7 +540,7 @@ def _dygraph_clip(self, params_grads): global_norm_var_fp64 = paddle.add_n(sum_square_list) global_norm_var.append(global_norm_var_fp64) global_norm_var = paddle.add_n(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm ) @@ -648,7 +648,7 @@ def _static_clip(self, params_grads): if len(global_norm_var) > 1 else global_norm_var[0] ) - global_norm_var = layers.sqrt(x=global_norm_var) + global_norm_var = paddle.sqrt(x=global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm ) @@ -727,7 +727,7 @@ def _create_operators(self, param, grad): group_scale_name = self.group_name + "_scale" if group_scale_name not in self.context: group_norm_var = 
layers.sums(input=self.context[self.group_name]) - group_norm_var = layers.sqrt(x=group_norm_var) + group_norm_var = paddle.sqrt(x=group_norm_var) clip_var = self.context[self.group_name + "_clip"] group_scale_var = layers.elementwise_div( x=clip_var, diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 569c9e6877396..69b48fbe23430 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -14,6 +14,7 @@ import copy +import paddle from paddle.fluid import layers, unique_name from paddle.fluid.dygraph import Layer from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper @@ -95,8 +96,8 @@ def __init__( self._hiden_size = hidden_size self._param_attr = param_attr self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh + self._gate_activation = gate_activation or paddle.nn.functional.sigmoid + self._activation = activation or paddle.tanh self._dtype = dtype def _build_once(self, input, pre_hidden): @@ -845,8 +846,8 @@ def __init__( self._hiden_size = hidden_size self._param_attr = param_attr self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh + self._gate_activation = gate_activation or paddle.nn.functional.sigmoid + self._activation = activation or paddle.tanh self._forget_bias = layers.fill_constant( [1], dtype=dtype, value=forget_bias ) @@ -879,10 +880,14 @@ def forward(self, input, pre_hidden, pre_cell): new_cell = layers.elementwise_add( layers.elementwise_mul( pre_cell, - layers.sigmoid(layers.elementwise_add(f, self._forget_bias)), + paddle.nn.functional.sigmoid( + layers.elementwise_add(f, self._forget_bias) + ), + ), + layers.elementwise_mul( + paddle.nn.functional.sigmoid(i), paddle.tanh(j) ), - layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)), ) - new_hidden = layers.tanh(new_cell) * layers.sigmoid(o) + new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o) return new_hidden, new_cell diff --git a/python/paddle/fluid/contrib/slim/quantization/adaround.py b/python/paddle/fluid/contrib/slim/quantization/adaround.py index 278994ef318a1..b253e15918a5d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/adaround.py +++ b/python/paddle/fluid/contrib/slim/quantization/adaround.py @@ -17,6 +17,7 @@ import sys import logging +import paddle import paddle.fluid as fluid from ....log_helper import get_logger @@ -41,7 +42,9 @@ def compute_soft_rounding(alpha_v): return fluid.layers.clip( - fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, min=0, max=1 + paddle.nn.functional.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, + min=0, + max=1, ) @@ -73,8 +76,7 @@ def round_loss_fn(): # calculate regularization term - which ensures parameter to converge to exactly zeros and ones # at the end of optimization reg_term = fluid.layers.reduce_sum( - -fluid.layers.pow(fluid.layers.abs(2 * h_v - 1), factor=beta) - + 1 + -fluid.layers.pow(paddle.abs(2 * h_v - 1), factor=beta) + 1 ) # calculate the rounding loss diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index adf02eacb306a..70c63c1d54af8 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -82,7 +82,7 @@ def bow_net( input=data, is_sparse=is_sparse, 
size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 21e0dc7c20c62..faa7a434e3010 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -270,12 +270,10 @@ def __init__( self.staircase = staircase def step(self): - from .. import layers - div_res = self.create_lr_var(self.step_num / self.decay_steps) if self.staircase: - div_res = layers.floor(div_res) - decayed_lr = self.learning_rate * layers.exp( + div_res = paddle.floor(div_res) + decayed_lr = self.learning_rate * paddle.exp( -1 * self.decay_rate * div_res ) @@ -356,11 +354,9 @@ def __init__( self.staircase = staircase def step(self): - from .. import layers - div_res = self.create_lr_var(self.step_num / self.decay_steps) if self.staircase: - div_res = layers.floor(div_res) + div_res = paddle.floor(div_res) decayed_lr = self.learning_rate * (self.decay_rate**div_res) @@ -437,11 +433,9 @@ def __init__( self.staircase = staircase def step(self): - from .. import layers - div_res = self.create_lr_var(self.step_num / self.decay_steps) if self.staircase: - div_res = layers.floor(div_res) + div_res = paddle.floor(div_res) decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res) @@ -524,12 +518,10 @@ def __init__( self.cycle = cycle def step(self): - from .. import layers - tmp_step_num = self.step_num tmp_decay_steps = self.decay_steps if self.cycle: - div_res = layers.ceil( + div_res = paddle.ceil( self.create_lr_var(tmp_step_num / float(self.decay_steps)) ) @@ -601,15 +593,13 @@ def __init__( self.epochs = epochs def step(self): - from .. import layers - - cur_epoch = layers.floor( + cur_epoch = paddle.floor( self.create_lr_var(self.step_num / self.step_each_epoch) ) decayed_lr = ( self.learning_rate * 0.5 - * (layers.cos(cur_epoch * math.pi / self.epochs) + 1) + * (paddle.cos(cur_epoch * math.pi / self.epochs) + 1) ) return decayed_lr diff --git a/python/paddle/fluid/dygraph/rnn.py b/python/paddle/fluid/dygraph/rnn.py index e8fd385a0a2b8..fa88dc44bbd21 100644 --- a/python/paddle/fluid/dygraph/rnn.py +++ b/python/paddle/fluid/dygraph/rnn.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from . 
import Layer from ..layers import ( - sigmoid, - tanh, concat, fill_constant, matmul, @@ -139,8 +138,8 @@ def __init__( self._param_attr = param_attr self._bias_attr = bias_attr self._dtype = dtype - self._gate_activation = gate_activation or sigmoid - self._activation = activation or tanh + self._gate_activation = gate_activation or paddle.nn.functional.sigmoid + self._activation = activation or paddle.tanh self._use_cudnn_impl = use_cudnn_impl if self._use_cudnn_impl: @@ -254,7 +253,9 @@ def forward(self, input, pre_hidden, pre_cell): elementwise_add(f, self._forget_bias) ), ), - elementwise_mul(sigmoid(i), tanh(j)), + elementwise_mul( + paddle.nn.functional.sigmoid(i), paddle.tanh(j) + ), ) new_hidden = self._activation(new_cell) * self._gate_activation(o) @@ -357,8 +358,8 @@ def __init__( self._param_attr = param_attr self._bias_attr = bias_attr self._dtype = dtype - self._gate_activation = gate_activation or sigmoid - self._activation = activation or tanh + self._gate_activation = gate_activation or paddle.nn.functional.sigmoid + self._activation = activation or paddle.tanh self._use_cudnn_impl = use_cudnn_impl if self._use_cudnn_impl: diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py index 560dbaf2c4873..9ce0c0afeb95a 100644 --- a/python/paddle/fluid/layers/__init__.py +++ b/python/paddle/fluid/layers/__init__.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import ops -from .ops import * from . import nn from .nn import * from . import io @@ -43,7 +41,6 @@ __all__ += io.__all__ __all__ += tensor.__all__ __all__ += control_flow.__all__ -__all__ += ops.__all__ __all__ += device.__all__ __all__ += detection.__all__ __all__ += metric_op.__all__ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 62523afc9aabc..658941ad4446a 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -14,7 +14,7 @@ from ..wrapped_decorator import signature_safe_contextmanager -from .layer_function_generator import autodoc, templatedoc +from .layer_function_generator import templatedoc from .tensor import assign, cast, fill_constant from .. import core from ..framework import ( diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1f5c0273e5911..b7a3b2aba9c88 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -17,15 +17,13 @@ import paddle -from .layer_function_generator import generate_layer_fn -from .layer_function_generator import autodoc, templatedoc +from .layer_function_generator import templatedoc from ..layer_helper import LayerHelper from ..framework import Variable, _non_static_mode, static_only, in_dygraph_mode from .. import core from .loss import softmax_with_cross_entropy from . import tensor from . import nn -from . import ops from ..data_feeder import check_variable_and_dtype, check_type, check_dtype import math import numpy as np diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 0010a39e7fc31..e7c846c1fe08a 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -14,7 +14,6 @@ from . import control_flow from . import tensor -from . import ops from . 
import nn import math import numpy as np @@ -535,8 +534,8 @@ def kl_divergence(self, other): other_logits = other.logits - nn.reduce_max( other.logits, dim=-1, keep_dim=True ) - e_logits = ops.exp(logits) - other_e_logits = ops.exp(other_logits) + e_logits = paddle.exp(logits) + other_e_logits = paddle.exp(other_logits) z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) prob = e_logits / z @@ -556,7 +555,7 @@ def entropy(self): """ logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) - e_logits = ops.exp(logits) + e_logits = paddle.exp(logits) z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) prob = e_logits / z entropy = -1.0 * nn.reduce_sum( diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 1dcc07a20d06c..e92fea5afb1b0 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,7 +19,6 @@ from ..data_feeder import DataFeeder from .control_flow import BlockGuard -from .layer_function_generator import templatedoc from .. import core from ..executor import global_scope from ..framework import ( diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py old mode 100755 new mode 100644 index b75e628a37a76..8de02b495c720 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -45,14 +45,11 @@ def _convert_(name): """ Formatting. - Args: name: The name/alias - This function takes in a name and converts it to a standard format of group1_group2. Where as per the regular expression, group1 can have alphabets and numbers and group2 has capital alphabets. - """ s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() @@ -80,10 +77,8 @@ def _generate_doc_string_( ): """ Generate docstring by OpProto - Args: op_proto (framework_pb2.OpProto): a protobuf message typed OpProto - Returns: str: the document string """ @@ -148,13 +143,10 @@ def _generate_doc_string_( def generate_layer_fn(op_type): """Register the Python layer for an Operator. - Args: op_type: The name of the operator to be created. - This function takes in the operator type (sigmoid, mean , average etc) and creates the operator functionality. - """ op_proto = OpProtoHolder.instance().get_op_proto(op_type) not_intermediate_outputs = [ @@ -271,13 +263,10 @@ def func(*args, **kwargs): def generate_activation_fn(op_type): """Register the Python layer for an Operator without Attribute. - Args: op_type: The name of the operator to be created. - This function takes in the operator type (sigmoid, exp , tanh etc) and creates the operator functionality. - """ op_proto = OpProtoHolder.instance().get_op_proto(op_type) @@ -330,10 +319,8 @@ def func(x, name=None): def generate_inplace_fn(inplace_op_type): """Register the Python layer for an Inplace Operator without Attribute. - Args: inplace_op_type: The name of the inplace operator to be created. - This function takes in the inplace operator type (exp_ , ceil_ etc) and creates the operator functionality. """ @@ -378,12 +365,10 @@ def templatedoc(op_type=None): """ Decorator of layer function. It will use the docstring from the layer function as the template. The template arguments are: - * ${comment}: The operator comment written in CPP. * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput, and AddInput. The ${name} is Python snake style. 
i.e., xxx_xxx. * ${{name}_type}: The type of ${name}. - Returns: Decorated function. """ @@ -438,7 +423,6 @@ def __impl__(func): def add_sample_code(func, sample_code): """ Append sample code for dynamically generated functions. - Args: func: The function of the function to be append sample code to. sample_code: sample code session in rst format. diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index cf56c793511b2..9767fe0af932d 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -26,7 +26,6 @@ import paddle from . import control_flow from . import nn -from . import ops from . import tensor from ..framework import default_main_program, Parameter, unique_name, name_scope from ..framework import Variable @@ -171,7 +170,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): div_res = global_step / decay_steps if staircase: - div_res = ops.floor(div_res) + div_res = paddle.floor(div_res) decayed_lr = learning_rate * (decay_rate**div_res) return decayed_lr @@ -233,8 +232,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): div_res = global_step / decay_steps if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = paddle.floor(div_res) + decayed_lr = learning_rate * paddle.exp(-1 * decay_rate * div_res) return decayed_lr @@ -293,7 +292,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): div_res = global_step / decay_steps if staircase: - div_res = ops.floor(div_res) + div_res = paddle.floor(div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res) @@ -347,7 +346,7 @@ def polynomial_decay( global_step = _decay_step_counter() if cycle: - div_res = ops.ceil(global_step / decay_steps) + div_res = paddle.ceil(global_step / decay_steps) zero_var = tensor.fill_constant( shape=[1], dtype='float32', value=0.0 ) @@ -497,11 +496,11 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): else: global_step = _decay_step_counter() - cur_epoch = ops.floor(global_step / step_each_epoch) + cur_epoch = paddle.floor(global_step / step_each_epoch) decayed_lr = ( learning_rate * 0.5 - * (ops.cos(cur_epoch * math.pi / epochs) + 1) + * (paddle.cos(cur_epoch * math.pi / epochs) + 1) ) return decayed_lr diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 6dbbb7338a701..1c4a1ef6acade 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1737,7 +1737,6 @@ def kldiv_loss(x, target, reduction='mean', name=None): return loss -from .ops import square from .control_flow import equal diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py deleted file mode 100755 index 61e12a98a0127..0000000000000 --- a/python/paddle/fluid/layers/ops.py +++ /dev/null @@ -1,975 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from .layer_function_generator import ( - generate_layer_fn, - generate_activation_fn, - generate_inplace_fn, - add_sample_code, -) -from .. import core -from ..framework import convert_np_dtype_to_dtype_, Variable, in_dygraph_mode -from ..data_feeder import ( - convert_dtype, - check_variable_and_dtype, - check_type, - check_dtype, -) -from paddle.utils import deprecated -from paddle import _C_ops, _legacy_C_ops -import paddle - -__deprecated_func_name__ = { - 'tanh_shrink': 'tanhshrink', - 'logsigmoid': 'log_sigmoid', -} - -__activations_noattr__ = [ - 'sigmoid', - 'silu', - 'logsigmoid', - 'tanh_shrink', - 'softsign', - 'tanh', -] - -__unary_func__ = [ - 'exp', - 'expm1', - 'atan', - 'sqrt', - 'rsqrt', - 'abs', - 'ceil', - 'floor', - 'cos', - 'tan', - 'acos', - 'sin', - 'sinh', - 'asin', - 'cosh', - 'round', - 'reciprocal', - 'square', - 'acosh', - 'asinh', - 'atanh', - 'lgamma', -] - -__inplace_unary_func__ = [ - 'exp_', - 'sqrt_', - 'rsqrt_', - 'ceil_', - 'floor_', - 'round_', - 'reciprocal_', -] - -__all__ = [ - 'softplus', - 'softshrink', - 'hard_shrink', - 'cumsum', - 'thresholded_relu', - 'gelu', - 'erf', -] - -for _OP in set(__all__): - globals()[_OP] = generate_layer_fn(_OP) - -# It is a hot fix in some unittest using: -# fluid.layers.scale(x=x, scale=10.0, out=out_var) -# e.g.: test_program_code.py, test_dist_train.py -globals()['_scale'] = generate_layer_fn('scale') - -globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') - -__all__ += __activations_noattr__ -__all__ += __unary_func__ -__all__ += __inplace_unary_func__ - -for _OP in set(__activations_noattr__): - _new_OP = _OP - if _OP in __deprecated_func_name__: - _new_OP = __deprecated_func_name__[_OP] - _func = generate_activation_fn(_OP) - _func = deprecated( - since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP) - )(_func) - globals()[_OP] = _func - -for _OP in set(__unary_func__): - _new_OP = _OP - if _OP in __deprecated_func_name__: - _new_OP = __deprecated_func_name__[_OP] - _func = generate_activation_fn(_OP) - _func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(_func) - globals()[_OP] = _func - -for _OP in set(__inplace_unary_func__): - _new_OP = _OP - if _OP in __deprecated_func_name__: - _new_OP = __deprecated_func_name__[_OP] - _func = generate_inplace_fn(_OP) - _func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(_func) - globals()[_OP] = _func - -add_sample_code( - globals()["sigmoid"], - r""" -Examples: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.sigmoid(x) - print(out) - # [0.40131234 0.450166 0.52497919 0.57444252] - -""", -) - -add_sample_code( - globals()["silu"], - r""" -Examples: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.silu(x) - print(out) - # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] - -""", -) - -add_sample_code( - globals()["logsigmoid"], - r""" -Examples: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.log_sigmoid(x) - print(out) - # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] - -""", -) - -add_sample_code( - globals()["exp"], - r""" -Examples: - .. 
code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.exp(x) - print(out) - # [0.67032005 0.81873075 1.10517092 1.34985881] - -""", -) - -add_sample_code( - globals()["expm1"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.expm1(x) - print(out) - # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] - -""", -) - -add_sample_code( - globals()["tanh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.tanh(x) - print(out) - # [-0.37994896 -0.19737532 0.09966799 0.29131261] - -""", -) - -add_sample_code( - globals()["atan"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.atan(x) - print(out) - # [-0.38050638 -0.19739556 0.09966865 0.29145679] - -""", -) - -add_sample_code( - globals()["tanh_shrink"], - r""" -Examples: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.tanhshrink(x) - print(out) - # [-0.020051, -0.00262468, 0.000332005, 0.00868739] - -""", -) - -add_sample_code( - globals()["sqrt"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) - out = paddle.sqrt(x) - print(out) - # [0.31622777 0.4472136 0.54772256 0.63245553] - -""", -) - -add_sample_code( - globals()["rsqrt"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) - out = paddle.rsqrt(x) - print(out) - # [3.16227766 2.23606798 1.82574186 1.58113883] - -""", -) - -add_sample_code( - globals()["abs"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.abs(x) - print(out) - # [0.4 0.2 0.1 0.3] - -""", -) - -add_sample_code( - globals()["ceil"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.ceil(x) - print(out) - # [-0. -0. 1. 1.] - -""", -) - -add_sample_code( - globals()["floor"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.floor(x) - print(out) - # [-1. -1. 0. 0.] - -""", -) - -add_sample_code( - globals()["cos"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.cos(x) - print(out) - # [0.92106099 0.98006658 0.99500417 0.95533649] - -""", -) - -add_sample_code( - globals()["tan"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.tan(x) - print(out) - # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] - -""", -) - -add_sample_code( - globals()["acos"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.acos(x) - print(out) - # [1.98231317 1.77215425 1.47062891 1.26610367] - -""", -) - -add_sample_code( - globals()["sin"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.sin(x) - print(out) - # [-0.38941834 -0.19866933 0.09983342 0.29552021] - -""", -) - -add_sample_code( - globals()["asin"], - r""" -Examples: - .. 
code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.asin(x) - print(out) - # [-0.41151685 -0.20135792 0.10016742 0.30469265] - -""", -) - -add_sample_code( - globals()["cosh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.cosh(x) - print(out) - # [1.08107237 1.02006676 1.00500417 1.04533851] - -""", -) - -add_sample_code( - globals()["sinh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.sinh(x) - print(out) - # [-0.41075233 -0.201336 0.10016675 0.30452029] - -""", -) - -add_sample_code( - globals()["asinh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.asinh(x) - print(out) - # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] - -""", -) - -add_sample_code( - globals()["acosh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([1., 3., 4., 5.]) - out = paddle.acosh(x) - print(out) - # [0. , 1.76274729, 2.06343699, 2.29243159] - -""", -) - -add_sample_code( - globals()["atanh"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.atanh(x) - print(out) - # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] - -""", -) - -add_sample_code( - globals()["round"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) - out = paddle.round(x) - print(out) - # [-1. -0. 1. 2.] - -""", -) - -add_sample_code( - globals()["reciprocal"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.reciprocal(x) - print(out) - # [-2.5 -5. 10. 3.33333333] - -""", -) - -add_sample_code( - globals()["square"], - r""" -Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.square(x) - print(out) - # [0.16 0.04 0.01 0.09] - -""", -) - -_softplus_ = generate_layer_fn('softplus') - - -def softplus(x, beta: float = 1.0, threshold: float = 20.0, name=None): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'softplus') - locals_val = locals().copy() - kwargs = dict() - for name, val in locals_val.items(): - if val is not None: - kwargs[name] = val - return _softplus_(**kwargs) - - -softplus.__doc__ = r""" - :alias_main: paddle.nn.functional.softplus - :alias: paddle.nn.functional.softplus, paddle.nn.functional.activation.softplus - :old_api: paddle.fluid.layers.softplus - -:strong:`Softplus Activation Operator` - -Equation: - .. math:: - out = \\frac{1}{beta} * log(1 + e^{beta * x}) - For numerical stability, the implementation reverts to the linear function when: beta * x > threshold. - -Args: - x(Tensor): Input of Softplus op, Tensor, dtype: float32 or float64 - beta(float, optional): The value of beta for softplus. Default is 1 - threshold (float, optional): The value of threshold for softplus. Default is 20 - name(str, optional): Name for the operation (optional, default is None) - -Returns: - Variable: The output of Softplus op, Tensor, dtype: float32 or float64 - -Examples: - .. 
code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softplus(x) - print(out) - # [0.513015, 0.598139, 0.744397, 0.854355] -""" - -add_sample_code( - globals()["softsign"], - r""" -Examples: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.softsign(x) - print(out) - # [-0.285714, -0.166667, 0.0909091, 0.230769] - -""", -) - -_softshrink_ = generate_layer_fn('softshrink') - - -def softshrink(x, alpha=None): - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'softshrink' - ) - - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - if name == 'alpha': - kwargs['lambda'] = val - else: - kwargs[name] = val - return _softshrink_(**kwargs) - - -softshrink.__doc__ = r""" - :alias_main: paddle.nn.functional.softshrink - :alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink - :old_api: paddle.fluid.layers.softshrink - -:strong:`Softshrink Activation Operator` - -.. math:: - out = \\begin{cases} - x - \\alpha, \\text{if } x > \\alpha \\\\ - x + \\alpha, \\text{if } x < -\\alpha \\\\ - 0, \\text{otherwise} - \\end{cases} - - -Args: - x: Input of Softshrink operator, an N-D Tensor, with data type float32, float64 or float16. - alpha (float): non-negative offset - -Returns: - Output of Softshrink operator with the same type of input. - -Examples: - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.data(name="input", shape=[None, 784]) - result = fluid.layers.softshrink(x=data, alpha=0.3) -""" - -_hard_shrink_ = generate_layer_fn('hard_shrink') - - -@deprecated(since="2.0.0", update_to="paddle.nn.functional.hardshrink") -def hard_shrink(x, threshold=None): - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'hard_shrink' - ) - - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - kwargs[name] = val - return _hard_shrink_(**kwargs) - - -hard_shrink.__doc__ = ( - _hard_shrink_.__doc__ - + """ -Examples: - - >>> import paddle.fluid as fluid - >>> data = fluid.layers.data(name="input", shape=[784]) - >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) -""" -) - -_cum_sum_ = generate_layer_fn('cumsum') - - -@deprecated( - since="2.0.0", - update_to="paddle.cumsum", - reason="New APIs for Paddle 2.0 are coming.", -) -def cumsum(x, axis=None, exclusive=None, reverse=None): - check_type(x, 'x', (Variable), 'cumsum') - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - -cumsum.__doc__ = """ - :alias_main: paddle.cumsum - :alias: paddle.cumsum,paddle.tensor.cumsum,paddle.tensor.math.cumsum - :old_api: paddle.fluid.layers.cumsum - -The cumulative sum of the elements along a given axis. By default, the first element of the result is the same of the first element of the input. If exlusive is true, the first element of the result is 0. - -Args: - x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed. - axis (int, optional): The dimension to accumulate along. -1 means the last dimension. Default is -1. - exclusive (bool, optional): Whether to perform exclusive cumsum. Default is False. - reverse (bool, optional): If true, the cumsum is performed in the reversed direction. Default is False. 
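The exclusive and reverse flags described above can be illustrated with a minimal NumPy-only sketch (no paddle needed); `cumsum_1d` is a hypothetical helper written for this note, not a Paddle API, and it mirrors the documented semantics rather than the operator's kernel.

.. code-block:: python

    # Illustrative 1-D sketch of the documented cumsum semantics (NumPy only).
    import numpy as np

    def cumsum_1d(x, exclusive=False, reverse=False):
        x = np.asarray(x, dtype=np.float64)
        if reverse:
            x = x[::-1]
        out = np.cumsum(x)
        if exclusive:
            # the first element of the result becomes 0, as described above
            out = np.concatenate(([0.0], out[:-1]))
        if reverse:
            out = out[::-1]
        return out

    x = [1.0, 2.0, 3.0, 4.0]
    print(cumsum_1d(x))                  # [ 1.  3.  6. 10.]
    print(cumsum_1d(x, exclusive=True))  # [0. 1. 3. 6.]
    print(cumsum_1d(x, reverse=True))    # [10.  9.  7.  4.]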
- -Returns: - Variable(Tensor/LoDTensor): The result of cumsum operator, output of cumsum operator. - -Examples: - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.layers.data(name="input", shape=[32, 784]) - result = fluid.layers.cumsum(data, axis=0) -""" - -_thresholded_relu_ = generate_layer_fn('thresholded_relu') - - -def thresholded_relu(x, threshold=None): - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'thresholded_relu' - ) - - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - kwargs[name] = val - - return _thresholded_relu_(**kwargs) - - -thresholded_relu.__doc__ = r""" - :alias_main: paddle.nn.functional.thresholded_relu - :alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu - :old_api: paddle.fluid.layers.thresholded_relu - -:strong:`Thresholded ReLU Activation Operator` - -Equation: - .. math:: - out = \\begin{cases} - x, &if x > threshold \\\\ - 0, &otherwise - \\end{cases} - -Args: - x(Variable): The input of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64. - - threshold(float, optional): The threshold value. Note that if the arg `threshold` is not set, the threshold in the equation is 1.0. - -Returns: - - Variable: The output of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input. - -Examples: - - .. code-block:: python - - # declarative mode - import numpy as np - from paddle import fluid - - x = fluid.data(name="x", shape=(-1, 3), dtype="float32") - y = fluid.layers.thresholded_relu(x, threshold=0.1) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - start = fluid.default_startup_program() - main = fluid.default_main_program() - - data = np.random.randn(2, 3).astype("float32") - exe.run(start) - - y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - - data - # array([[ 0.21134382, -1.1805999 , 0.32876605], - # [-1.2210793 , -0.7365624 , 1.0013918 ]], dtype=float32) - y_np - # array([[ 0.21134382, -0. , 0.32876605], - # [-0. , -0. , 1.0013918 ]], dtype=float32) - - .. code-block:: python - - # imperative mode - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - - data = np.random.randn(2, 3).astype("float32") - place = fluid.CPUPlace() - with dg.guard(place) as g: - x = dg.to_variable(data) - y = fluid.layers.thresholded_relu(x, threshold=0.1) - y_np = y.numpy() - data - # array([[ 0.21134382, -1.1805999 , 0.32876605], - # [-1.2210793 , -0.7365624 , 1.0013918 ]], dtype=float32) - y_np - # array([[ 0.21134382, -0. , 0.32876605], - # [-0. , -0. , 1.0013918 ]], dtype=float32) -""" - -_gelu_ = generate_layer_fn('gelu') - - -@deprecated(since="2.0.0", update_to="paddle.nn.functional.gelu") -def gelu(x, approximate=False): - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - kwargs[name] = val - return _gelu_(**kwargs) - - -gelu.__doc__ = r""" -:strong:`GeLU Activation Operator` -For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415). - -Equation: - if approximate is True - .. math:: - out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) - - else - .. math:: - out = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) - -Args: - - x(Variable): The input of GeLU op, Tensor or LoDTensor, dtype: float32 or float64. 
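The two GeLU equations quoted above (the erf form and the tanh approximation) can be compared numerically with a small standalone sketch; this uses only NumPy and the standard library and is a numerical illustration of the formulas, not Paddle's implementation.

.. code-block:: python

    # Numerical sketch of the exact and tanh-approximated GeLU formulas above.
    import math
    import numpy as np

    def gelu_exact(x):
        # 0.5 * x * (1 + erf(x / sqrt(2)))
        return 0.5 * x * (1.0 + np.vectorize(math.erf)(x / math.sqrt(2.0)))

    def gelu_tanh_approx(x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        return 0.5 * x * (
            1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3))
        )

    x = np.linspace(-3.0, 3.0, 7)
    print(gelu_exact(x))
    print(gelu_tanh_approx(x))
    # On this range the two variants agree to within roughly 1e-3.
    print(np.max(np.abs(gelu_exact(x) - gelu_tanh_approx(x))))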
- -Returns: - - Variable: The output of GeLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input. - -Examples: - - .. code-block:: python - - # declarative mode - import numpy as np - from paddle import fluid - - x = fluid.data(name="x", shape=(-1, 3), dtype="float32") - y = fluid.layers.gelu(x) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - start = fluid.default_startup_program() - main = fluid.default_main_program() - - data = np.random.randn(2, 3).astype("float32") - exe.run(start) - - y_np, = exe.run(main, feed={"x": data}, fetch_list=[y]) - - data - # array([[ 0.87165993, -1.0541513 , -0.37214822], - # [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32) - y_np - # array([[ 0.70456535, -0.15380788, -0.13207214], - # [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32) - - .. code-block:: python - - # imperative mode - import numpy as np - from paddle import fluid - import paddle.fluid.dygraph as dg - - data = np.random.randn(2, 3).astype("float32") - place = fluid.CPUPlace() - with dg.guard(place) as g: - x = dg.to_variable(data) - y = fluid.layers.gelu(x) - y_np = y.numpy() - data - # array([[ 0.87165993, -1.0541513 , -0.37214822], - # [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32) - y_np - # array([[ 0.70456535, -0.15380788, -0.13207214], - # [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32) -""" - -_erf_ = generate_layer_fn('erf') - - -def erf(x, name=None): - if in_dygraph_mode(): - return _C_ops.erf(x) - - locals_var = locals().copy() - kwargs = dict() - for name, val in locals_var.items(): - if val is not None: - kwargs[name] = val - return _erf_(**kwargs) - - -erf.__doc__ = r""" -:strong:`Erf Operator` -For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function). - -Equation: - .. math:: - out = \\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x}e^{- \\eta^{2}}d\\eta - -Args: - - x (Tensor): The input tensor, it's data type should be float32, float64. - -Returns: - - Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input. - -Examples: - - .. code-block:: python - - import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.erf(x) - print(out) - # [-0.42839236 -0.22270259 0.11246292 0.32862676] -""" - - -def lgamma(x, name=None): - r""" - Calculates the lgamma of the given input tensor, element-wise. - - This operator performs elementwise lgamma for input $X$. - :math:`out = log\Gamma(x)` - - - Args: - x (Tensor): Input Tensor. Must be one of the following types: float32, float64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor, the lgamma of the input Tensor, the shape and data type is the same with input. - - Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.lgamma(x) - print(out) - # [1.31452441, 1.76149750, 2.25271273, 1.09579802] - """ - return paddle.Tensor.lgamma(x) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index bfff9c12e9241..bccb7e039824b 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from . 
import layers from .data_feeder import check_variable_and_dtype, convert_dtype from ..utils import deprecated @@ -387,7 +388,7 @@ def glu(input, dim=-1): input, 'input', ['float16', 'float32', 'float64'], "glu" ) a, b = layers.split(input, num_or_sections=2, dim=dim) - act_b = layers.sigmoid(x=b) + act_b = paddle.nn.functional.sigmoid(x=b) out = layers.elementwise_mul(x=a, y=act_b) return out diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e3d71a0bafa8c..8e030a54d832b 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -48,7 +48,6 @@ from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper -from .layers import ops from .dygraph import base as imperative_base from .dygraph import no_grad from .dygraph.learning_rate_scheduler import ( @@ -4457,7 +4456,7 @@ def _add_average_apply_op(self, block, param_grad): sum = layers.cast( x=sum, dtype='float32' if self._dtype is None else self._dtype ) - ops._elementwise_div(x=sum, y=tmp, out=param) + paddle.assign(paddle.divide(sum, tmp), output=param) def _add_average_restore_op(self, block, param_grad): param = block._clone_variable(param_grad[0]) diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index 02a17b9f92c99..4a9b7fefc9f3c 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -70,21 +70,21 @@ def gate_common(ipt, hidden, size): gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False) return gate0 + gate1 - forget_gate = fluid.layers.sigmoid( + forget_gate = paddle.nn.functional.sigmoid( x=gate_common(word, prev_hidden, lstm_size) ) - input_gate = fluid.layers.sigmoid( + input_gate = paddle.nn.functional.sigmoid( x=gate_common(word, prev_hidden, lstm_size) ) - output_gate = fluid.layers.sigmoid( + output_gate = paddle.nn.functional.sigmoid( x=gate_common(word, prev_hidden, lstm_size) ) - cell_gate = fluid.layers.sigmoid( + cell_gate = paddle.nn.functional.sigmoid( x=gate_common(word, prev_hidden, lstm_size) ) cell = forget_gate * prev_cell + input_gate * cell_gate - hidden = output_gate * fluid.layers.tanh(x=cell) + hidden = output_gate * paddle.tanh(x=cell) rnn.update_memory(prev_cell, cell) rnn.update_memory(prev_hidden, hidden) rnn.output(hidden) diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index f896e24497599..f2fdc1016fdff 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -70,10 +70,10 @@ def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): def linear(inputs): return fluid.layers.fc(input=inputs, size=size, bias_attr=True) - forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) - input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) - output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) - cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + forget_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = paddle.tanh(x=linear([hidden_t_prev, x_t])) cell_t = fluid.layers.sums( input=[ @@ -83,7 +83,7 @@ def linear(inputs): ) 
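The gate arithmetic being migrated in these LSTM hunks follows the standard equations (sigmoid gates, tanh candidate cell); a minimal NumPy sketch of one step is given below, with random placeholder weights standing in for the per-gate fc layers, purely to show the computation the sigmoid/tanh calls implement.

.. code-block:: python

    # Minimal NumPy sketch of a single LSTM step (illustrative only).
    import numpy as np

    rng = np.random.default_rng(0)
    size = 4
    x_t = rng.standard_normal(size)
    h_prev = rng.standard_normal(size)
    c_prev = rng.standard_normal(size)

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def linear(z):
        # fresh random projection per call, standing in for the per-gate fc layers
        w = rng.standard_normal((z.size, size))
        b = rng.standard_normal(size)
        return z @ w + b

    concat = np.concatenate([h_prev, x_t])
    f = sigmoid(linear(concat))   # forget gate
    i = sigmoid(linear(concat))   # input gate
    o = sigmoid(linear(concat))   # output gate
    g = np.tanh(linear(concat))   # candidate cell (cell_tilde)

    c_t = f * c_prev + i * g      # new cell state
    h_t = o * np.tanh(c_t)        # new hidden state
    print(h_t.shape, c_t.shape)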
hidden_t = fluid.layers.elementwise_mul( - x=output_gate, y=fluid.layers.tanh(x=cell_t) + x=output_gate, y=paddle.tanh(x=cell_t) ) return hidden_t, cell_t diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index aa4aca0724c70..321ff522c0874 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -175,12 +175,12 @@ def compare_ifelse_op_and_numpy(self, place): ie = layers.IfElse(ifcond) with ie.true_block(): true_target = ie.input(src) - true_target = fluid.layers.exp(true_target) + true_target = paddle.exp(true_target) ie.output(true_target) with ie.false_block(): false_target = ie.input(src) - false_target = fluid.layers.tanh(false_target) + false_target = paddle.tanh(false_target) ie.output(false_target) if_out = ie() out = layers.reduce_sum(if_out[0]) @@ -244,7 +244,7 @@ def test_input_type_error(self): ie = layers.IfElse(ifcond) with ie.true_block(): true_target = ie.input(src) - true_target = fluid.layers.exp(true_target) + true_target = paddle.exp(true_target) ie.output([]) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index cc6371cc7cc67..c9a2a16da2db8 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -130,7 +130,7 @@ def train_network( q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -157,7 +157,7 @@ def train_network( pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -181,7 +181,7 @@ def train_network( nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 2604bdd3a690d..d7d1fd4cf9622 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -61,7 +61,7 @@ def dyfunc_with_if_else2(x, col=100): y = fluid.layers.relu(x) else: x_pow = fluid.layers.pow(x, 2) - y = fluid.layers.tanh(x_pow) + y = paddle.tanh(x_pow) return y @@ -161,7 +161,7 @@ def nested_if_else(x_v): tmp = y * w y = fluid.layers.relu(tmp) if paddle.mean(y).numpy()[0] < batch_size: - y = fluid.layers.abs(y) + y = paddle.abs(y) else: tmp = fluid.layers.fill_constant( y.shape, dtype='float32', value=-1 @@ -276,7 +276,7 @@ def forward(self, input): self.constant_vars['w'] = fluid.layers.fill_constant( [hidden_dim], dtype='float32', value=9 ) - y = fluid.layers.abs(y) + y = paddle.abs(y) else: tmp = fluid.layers.fill_constant( y.shape, dtype='float32', value=-1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 20aa0870086f4..9f8eba7f59af3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -49,8 +49,8 @@ def __init__( self._hiden_size = hidden_size self._param_attr = param_attr self._bias_attr = bias_attr - self._gate_activation = gate_activation or layers.sigmoid - self._activation = activation or layers.tanh + self._gate_activation = gate_activation or paddle.nn.functional.sigmoid + self._activation = activation or paddle.tanh self._forget_bias = forget_bias self._dtype = dtype self._input_size = input_size @@ -76,12 +76,14 @@ def forward(self, input, pre_hidden, pre_cell): i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) new_cell = layers.elementwise_add( layers.elementwise_mul( - pre_cell, layers.sigmoid(f + self._forget_bias) + pre_cell, paddle.nn.functional.sigmoid(f + self._forget_bias) + ), + layers.elementwise_mul( + paddle.nn.functional.sigmoid(i), paddle.tanh(j) ), - layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)), ) - new_hidden = layers.tanh(new_cell) * layers.sigmoid(o) + new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o) return new_hidden, new_cell diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index facc72faf8736..2d58a64ca2dc2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -13,7 +13,6 @@ # limitations under the License. import paddle - import paddle.fluid as fluid import paddle.fluid.param_attr as attr @@ -232,7 +231,7 @@ def ops(self, input): """ operation """ - softsign = fluid.layers.softsign(input) + softsign = paddle.nn.functional.softsign(input) return softsign diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index 12a4f48f64452..ab79a05796de4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -89,28 +89,22 @@ def forward(self, input_A, input_B): cyc_A = self.build_generator_resnet_9blocks_b(fake_B) cyc_B = self.build_generator_resnet_9blocks_a(fake_A) - diff_A = fluid.layers.abs( - fluid.layers.elementwise_sub(x=input_A, y=cyc_A) - ) - diff_B = fluid.layers.abs( - fluid.layers.elementwise_sub(x=input_B, y=cyc_B) - ) + diff_A = paddle.abs(fluid.layers.elementwise_sub(x=input_A, y=cyc_A)) + diff_B = paddle.abs(fluid.layers.elementwise_sub(x=input_B, y=cyc_B)) cyc_A_loss = fluid.layers.reduce_mean(diff_A) * lambda_A cyc_B_loss = fluid.layers.reduce_mean(diff_B) * lambda_B cyc_loss = cyc_A_loss + cyc_B_loss fake_rec_A = self.build_gen_discriminator_a(fake_B) - g_A_loss = fluid.layers.reduce_mean(fluid.layers.square(fake_rec_A - 1)) + g_A_loss = paddle.mean(paddle.square(fake_rec_A - 1)) fake_rec_B = self.build_gen_discriminator_b(fake_A) - g_B_loss = fluid.layers.reduce_mean(fluid.layers.square(fake_rec_B - 1)) + g_B_loss = paddle.mean(paddle.square(fake_rec_B - 1)) G = g_A_loss + g_B_loss idt_A = self.build_generator_resnet_9blocks_a(input_B) idt_loss_A = ( fluid.layers.reduce_mean( - fluid.layers.abs( - fluid.layers.elementwise_sub(x=input_B, 
y=idt_A) - ) + paddle.abs(fluid.layers.elementwise_sub(x=input_B, y=idt_A)) ) * lambda_B * lambda_identity @@ -119,9 +113,7 @@ def forward(self, input_A, input_B): idt_B = self.build_generator_resnet_9blocks_b(input_A) idt_loss_B = ( fluid.layers.reduce_mean( - fluid.layers.abs( - fluid.layers.elementwise_sub(x=input_A, y=idt_B) - ) + paddle.abs(fluid.layers.elementwise_sub(x=input_A, y=idt_B)) ) * lambda_A * lambda_identity @@ -271,7 +263,7 @@ def forward(self, inputs): y = self.deconv1(y) y = fluid.layers.pad2d(y, [3, 3, 3, 3], mode="reflect") y = self.conv3(y) - y = fluid.layers.tanh(y) + y = paddle.tanh(y) return y @@ -647,8 +639,7 @@ def train(args, to_static): data_B, fake_pool_B ) d_loss_A = ( - fluid.layers.square(fake_pool_rec_B) - + fluid.layers.square(rec_B - 1) + paddle.square(fake_pool_rec_B) + paddle.square(rec_B - 1) ) / 2.0 d_loss_A = fluid.layers.reduce_mean(d_loss_A) @@ -661,8 +652,7 @@ def train(args, to_static): data_A, fake_pool_A ) d_loss_B = ( - fluid.layers.square(fake_pool_rec_A) - + fluid.layers.square(rec_A - 1) + paddle.square(fake_pool_rec_A) + paddle.square(rec_A - 1) ) / 2.0 d_loss_B = fluid.layers.reduce_mean(d_loss_B) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index e5d097ebba62d..6b4da8aa1b536 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -99,10 +99,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): i, j, f, o = fluid.layers.split( gate_input, num_or_sections=4, dim=-1 ) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i - ) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) hidden_array[k] = m cell_array[k] = c step_input = m diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 25ca7e08472a0..042ba310619af 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -145,7 +145,7 @@ def forward(self, inputs, label=None): emb = emb * mask_emb emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim]) bow_1 = fluid.layers.reduce_sum(emb, dim=1) - bow_1 = fluid.layers.tanh(bow_1) + bow_1 = paddle.tanh(bow_1) fc_1 = self._fc1(bow_1) fc_2 = self._fc2(fc_1) prediction = self._fc_prediction(fc_2) @@ -197,7 +197,7 @@ def forward(self, inputs, label=None): fc_1 = self._fc1(emb) gru_hidden = self._gru(fc_1) gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) - tanh_1 = fluid.layers.tanh(gru_hidden) + tanh_1 = paddle.tanh(gru_hidden) fc_2 = self._fc2(tanh_1) prediction = self._fc_prediction(fc_2) @@ -253,8 +253,8 @@ def forward(self, inputs, label=None): fc_1 = self._fc1(emb) gru_forward = self._gru_forward(fc_1) gru_backward = self._gru_backward(fc_1) - gru_forward_tanh = fluid.layers.tanh(gru_forward) - gru_backward_tanh = fluid.layers.tanh(gru_backward) + gru_forward_tanh = paddle.tanh(gru_forward) + gru_backward_tanh = paddle.tanh(gru_backward) encoded_vector = fluid.layers.concat( input=[gru_forward_tanh, gru_backward_tanh], axis=2 ) diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py index 5dfcb867c7e99..962ef28707ea1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import unittest +import paddle from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph import declarative @@ -260,7 +261,7 @@ def forward(self, center_words, target_words, label): ) word_sim = fluid.layers.reduce_sum(word_sim, dim=-1) - pred = fluid.layers.sigmoid(word_sim) + pred = paddle.nn.functional.sigmoid(word_sim) loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label) loss = fluid.layers.reduce_mean(loss) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index d31a78e1230d9..a6142b8353608 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -51,7 +51,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32" ) - out = paddle.fluid.layers.cumsum(x, **self.attrs) + out = paddle.cumsum(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): @@ -90,7 +90,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype="int32" ) - out = paddle.fluid.layers.cumsum(x, **self.attrs) + out = paddle.cumsum(x, **self.attrs) self.fetch_list = [out.name] @@ -104,7 +104,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype="int64" ) - out = paddle.fluid.layers.cumsum(x, **self.attrs) + out = paddle.cumsum(x, **self.attrs) self.fetch_list = [out.name] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 031fd25777249..d021b618c8323 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -45,7 +45,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) - out = paddle.fluid.layers.gelu(x, **self.attrs) + out = paddle.nn.functional.gelu(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py index 9670e1da6334d..f0773ee8e39e4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py @@ -29,7 +29,7 @@ def setUp(self): self.set_feed_attr() def set_test_op(self): - self.op = paddle.fluid.layers.abs + self.op = paddle.abs self.op_attrs = {} def set_data_feed(self): @@ -70,55 +70,55 @@ def set_atol(self): self.atol = 1e-6 def set_test_op(self): - self.op = paddle.fluid.layers.acos + self.op = paddle.acos self.op_attrs = {} class TestAsin(TestAcos): def set_test_op(self): - self.op = paddle.fluid.layers.asin + self.op = paddle.asin self.op_attrs = {} class TestSinh(TestAcos): def set_test_op(self): - self.op = paddle.fluid.layers.sinh + self.op = paddle.sinh self.op_attrs = {} class TestAtan(TestBase): def 
set_test_op(self): - self.op = paddle.fluid.layers.atan + self.op = paddle.atan self.op_attrs = {} class TestCeil(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.ceil + self.op = paddle.ceil self.op_attrs = {} class TestCos(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.cos + self.op = paddle.cos self.op_attrs = {} class TestCosh(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.cosh + self.op = paddle.cosh self.op_attrs = {} class TestErf(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.erf + self.op = paddle.erf self.op_attrs = {} class TestExp(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.exp + self.op = paddle.exp self.op_attrs = {} @@ -128,19 +128,19 @@ def fp16_enabled(self): return False def set_test_op(self): - self.op = paddle.fluid.layers.floor + self.op = paddle.floor self.op_attrs = {} class TestLog(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.log + self.op = paddle.log self.op_attrs = {} class TestReciprocal(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.reciprocal + self.op = paddle.reciprocal self.op_attrs = {} @@ -152,55 +152,55 @@ def set_test_op(self): class TestRound(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.round + self.op = paddle.round self.op_attrs = {} class TestSigmoid(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.sigmoid + self.op = paddle.nn.functional.sigmoid self.op_attrs = {} class TestSign(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.sign + self.op = paddle.sign self.op_attrs = {} class TestSin(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.sin + self.op = paddle.sin self.op_attrs = {} class TestSoftplus(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.softplus + self.op = paddle.nn.functional.softplus self.op_attrs = {} class TestSoftsign(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.softsign + self.op = paddle.nn.functional.softsign self.op_attrs = {} class TestSqrt(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.sqrt + self.op = paddle.sqrt self.op_attrs = {} class TestTan(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.tan + self.op = paddle.tan self.op_attrs = {} class TestTanh(TestBase): def set_test_op(self): - self.op = paddle.fluid.layers.tanh + self.op = paddle.tanh self.op_attrs = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index 3c2365c29e60c..bcf79b82baf47 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -75,7 +75,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.tanh + self.act = paddle.tanh class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( @@ -108,7 +108,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.sqrt + self.act = paddle.sqrt class ElementwiseActivationMkldnnFusePassTest_Add_ABS( @@ -116,7 +116,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_ABS( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.abs + self.act = 
paddle.abs class ElementwiseActivationMkldnnFusePassTest_Add_Clip( @@ -134,7 +134,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Gelu( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( @@ -142,7 +142,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu self.act_alpha = True @@ -159,7 +159,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( ): def set_params(self): self.operand = fluid.layers.elementwise_add - self.act = fluid.layers.sigmoid + self.act = paddle.nn.functional.sigmoid class ElementwiseActivationMkldnnFusePassTest_Sub_Relu( @@ -175,7 +175,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.tanh + self.act = paddle.tanh class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( @@ -208,7 +208,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.abs + self.act = paddle.abs class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( @@ -226,7 +226,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( @@ -234,7 +234,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu self.act_alpha = True @@ -251,7 +251,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( ): def set_params(self): self.operand = fluid.layers.elementwise_sub - self.act = fluid.layers.sigmoid + self.act = paddle.nn.functional.sigmoid class ElementwiseActivationMkldnnFusePassTest_Mul_Relu( @@ -267,7 +267,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.tanh + self.act = paddle.tanh class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( @@ -300,7 +300,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.sqrt + self.act = paddle.sqrt class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( @@ -308,7 +308,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.abs + self.act = paddle.abs class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( @@ -326,7 +326,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( @@ -334,7 +334,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( ): def set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.gelu + self.act = paddle.nn.functional.gelu self.act_alpha = True @@ -351,7 +351,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( ): def 
set_params(self): self.operand = fluid.layers.elementwise_mul - self.act = fluid.layers.sigmoid + self.act = paddle.nn.functional.sigmoid if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 9b6e3e641037d..4c86911c2eae1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -17,6 +17,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -81,7 +82,7 @@ def append_act(self, x): class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.sigmoid(x) + return paddle.nn.functional.sigmoid(x) class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest): @@ -108,7 +109,7 @@ def append_act(self, x): class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.tanh(x) + return paddle.tanh(x) class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest): @@ -303,7 +304,7 @@ def append_act(self, x): class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest): @@ -322,7 +323,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest): @@ -333,7 +334,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) class TensorRTSubgraphPassGeluFp16SerializeTest( @@ -346,7 +347,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) class TensorRTSubgraphPassGeluFp16DynamicTest( @@ -367,7 +368,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) class TensorRTSubgraphPassGeluFp16DynamicSerializeTest( @@ -388,7 +389,7 @@ def setUpTensorRTParam(self): ) def append_act(self, x): - return fluid.layers.gelu(x) + return paddle.nn.functional.gelu(x) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py index 3775a4d08d1aa..906406d43da9b 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py @@ -15,6 +15,7 @@ import unittest import numpy as np +import paddle from pass_test import PassTest import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -85,10 +86,14 @@ def build_program(self, dtype): one = layers.fill_constant(shape=[1], dtype=dtype, value=1.0) tmp_0 = one * self.feed_vars[0] # subgraph with 9 op nodes - tmp_1 = tmp_0 * layers.sigmoid(self.feed_vars[1]) + layers.sigmoid( - self.feed_vars[2] - ) * layers.tanh(self.feed_vars[3]) - tmp_2 = layers.tanh(tmp_1) + layers.sigmoid(self.feed_vars[4]) + tmp_1 = tmp_0 * paddle.nn.functional.sigmoid( + self.feed_vars[1] + ) + 
paddle.nn.functional.sigmoid(self.feed_vars[2]) * paddle.tanh( + self.feed_vars[3] + ) + tmp_2 = paddle.tanh(tmp_1) + paddle.nn.functional.sigmoid( + self.feed_vars[4] + ) self.append_gradients(tmp_2) @@ -162,10 +167,10 @@ def build_program(self, dtype): tmp_0 = layers.sum( [self.feed_vars[0], self.feed_vars[1], self.feed_vars[2]] ) - tmp_1 = layers.sqrt(tmp_0) + tmp_1 = paddle.sqrt(tmp_0) tmp_2 = layers.mul(tmp_0, self.feed_vars[3]) # subgraph with 2 op nodes - tmp_3 = layers.square(layers.sum([tmp_1, tmp_2])) + tmp_3 = paddle.square(layers.sum([tmp_1, tmp_2])) self.append_gradients(tmp_3) diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py index 3ea79600954aa..d1ffbe16884ef 100644 --- a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -97,7 +97,7 @@ def get_model( ) if self.bn_dtype == np.float16: bn = fluid.layers.cast(bn, 'float32') - sigmoid = fluid.layers.sigmoid(bn) + sigmoid = paddle.nn.functional.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) # if not sync_bn: # out = out / core.get_mlu_device_count() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py index 8a879aa22213a..d5b0913b86112 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_gelu_op_mlu.py @@ -109,7 +109,7 @@ def _test(self, run_mlu=True): c = paddle.multiply(a, b) fc_1 = fluid.layers.fc(input=c, size=128) - fc_1_gelu = fluid.layers.gelu(fc_1) + fc_1_gelu = paddle.nn.functional.gelu(fc_1) prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py index 4960fee949b61..464614db48908 100644 --- a/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/sync_batch_norm_op_npu.py @@ -99,7 +99,7 @@ def get_model( ) # if self.dtype == np.float16: # bn = fluid.layers.cast(bn, 'float32') - sigmoid = fluid.layers.sigmoid(bn) + sigmoid = paddle.nn.functional.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) # if not sync_bn: # out = out / core.get_npu_device_count() diff --git a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py index 0c2d163becadd..90e3f8dd2b206 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py @@ -109,7 +109,7 @@ def _test(self, run_npu=True): c = paddle.multiply(a, b) fc_1 = fluid.layers.fc(input=c, size=128) - fc_1_gelu = fluid.layers.gelu(fc_1) + fc_1_gelu = paddle.nn.functional.gelu(fc_1) prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py index f85a0f9e135e2..8984108db32cf 100644 --- a/python/paddle/fluid/tests/unittests/simple_nets.py +++ b/python/paddle/fluid/tests/unittests/simple_nets.py @@ -88,7 +88,7 @@ def bow_net( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') 
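Nearly every hunk in this patch applies the same mechanical renaming from the deprecated `fluid.layers` unary ops to their Paddle 2.0 equivalents; the dict below is assembled from the hunks themselves as a quick reading aid (plain strings only, not an official migration table).

.. code-block:: python

    # Summary of the old -> new API mapping applied throughout this patch.
    FLUID_TO_PADDLE = {
        "fluid.layers.sigmoid":  "paddle.nn.functional.sigmoid",
        "fluid.layers.tanh":     "paddle.tanh",
        "fluid.layers.sqrt":     "paddle.sqrt",
        "fluid.layers.rsqrt":    "paddle.rsqrt",
        "fluid.layers.abs":      "paddle.abs",
        "fluid.layers.square":   "paddle.square",
        "fluid.layers.gelu":     "paddle.nn.functional.gelu",
        "fluid.layers.softsign": "paddle.nn.functional.softsign",
        "fluid.layers.softplus": "paddle.nn.functional.softplus",
        "fluid.layers.cumsum":   "paddle.cumsum",
        "fluid.layers.sin":      "paddle.sin",
        "fluid.layers.cos":      "paddle.cos",
        "fluid.layers.exp":      "paddle.exp",
        "fluid.layers.log":      "paddle.log",
    }

    for old, new in FLUID_TO_PADDLE.items():
        print(f"{old:25s} -> {new}")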
- bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index cee448b0648f5..4182d1c586df5 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -33,7 +33,7 @@ def func(self, place): dtype = np.float64 x = layers.data('x', shape, False, dtype=dtype) x.persistable = True - y = layers.sigmoid(x) + y = F.sigmoid(x) x_arr = np.random.random(shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( @@ -51,7 +51,7 @@ def test_grad(self): class TestSigmoidDoubleGradCheck(unittest.TestCase): def sigmoid_wrapper(self, x): - return fluid.layers.sigmoid(x[0]) + return F.sigmoid(x[0]) @prog_scope() def func(self, place): @@ -60,7 +60,7 @@ def func(self, place): dtype = np.float64 x = layers.data('x', shape, False, dtype=dtype) x.persistable = True - y = layers.sigmoid(x) + y = F.sigmoid(x) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( @@ -92,7 +92,7 @@ def func(self, place): dtype = np.float64 x = layers.data('x', shape, False, dtype=dtype) x.persistable = True - y = layers.tanh(x) + y = paddle.tanh(x) x_arr = np.random.random(shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( @@ -322,7 +322,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.sqrt(x) + y = paddle.sqrt(x) x_arr = np.random.uniform(0.1, 1, shape).astype(dtype) gradient_checker.double_grad_check( @@ -354,7 +354,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.rsqrt(x) + y = paddle.rsqrt(x) x_arr = np.random.uniform(0.1, 1, shape).astype(dtype) gradient_checker.double_grad_check( @@ -386,7 +386,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.square(x) + y = paddle.square(x) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( @@ -417,7 +417,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.abs(x) + y = paddle.abs(x) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) # Because we set delta = 0.005 in calculating numeric gradient, # if x is too small, the numeric gradient is inaccurate. 
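The comment above about delta = 0.005 can be reproduced with a standalone central-difference check (NumPy only, no paddle): for abs(x) the numeric gradient is exact away from zero but collapses toward x / delta once |x| falls inside the stencil, which is why the tests keep |x| away from values comparable to the step.

.. code-block:: python

    # Central-difference gradient of abs(x) with the same step as the test.
    import numpy as np

    delta = 0.005
    x = np.array([1.0, 0.05, 0.004, 0.001])

    numeric = (np.abs(x + delta) - np.abs(x - delta)) / (2.0 * delta)
    analytic = np.sign(x)

    print(numeric)   # [1.  1.  0.8 0.2]
    print(analytic)  # [1. 1. 1. 1.]
    # For |x| < delta the kink at zero lies inside the stencil, so the
    # numeric estimate degenerates to x / delta instead of sign(x).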
@@ -449,7 +449,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.log(x) + y = paddle.log(x) x_arr = np.random.uniform(0.1, 1, shape).astype(dtype) @@ -608,7 +608,7 @@ def func(self, place): dtype = np.float64 x = layers.data('x', shape, False, dtype=dtype) x.persistable = True - y = layers.sin(x) + y = paddle.sin(x) x_arr = np.random.random(shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( @@ -733,7 +733,7 @@ def func(self, place): dtype = np.float64 x = layers.data('x', shape, False, dtype=dtype) x.persistable = True - y = layers.cos(x) + y = paddle.cos(x) x_arr = np.random.random(shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ddd287a580318..3b0057e226a3c 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -33,17 +33,17 @@ def test_errors(self): with program_guard(Program(), Program()): # The input type of sqrt op must be Variable or numpy.ndarray. in1 = 1 - self.assertRaises(TypeError, fluid.layers.sqrt, in1) + self.assertRaises(TypeError, paddle.sqrt, in1) # The input dtype of sqrt op must be float16, float32, float64. in2 = fluid.layers.data( name='input2', shape=[12, 10], dtype="int32" ) - self.assertRaises(TypeError, fluid.layers.sqrt, in2) + self.assertRaises(TypeError, paddle.sqrt, in2) in3 = fluid.layers.data( name='input3', shape=[12, 10], dtype="float16" ) - fluid.layers.sqrt(x=in3) + paddle.sqrt(x=in3) class TestActivation(OpTest): @@ -390,16 +390,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with paddle.static.program_guard(paddle.static.Program()): - x = paddle.fluid.data('X', [11, 17]) - out = paddle.fluid.layers.logsigmoid(x) - exe = paddle.static.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = np.log(1 / (1 + np.exp(-self.x_np))) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -488,16 +478,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', [10, 12], self.dtype) - out = fluid.layers.tanh(x) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = np.tanh(self.x_np) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -593,7 +573,7 @@ def test_dygraph(self): with fluid.dygraph.guard(): np_x = np.array([0.1]) x = fluid.dygraph.to_variable(np_x) - z = fluid.layers.sinh(x).numpy() + z = paddle.sinh(x).numpy() z_expected = np.sinh(np_x) np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -610,7 +590,7 @@ def test_api(self): dtype="float32", ) - pd_sinh_out = fluid.layers.sinh(data_x) + pd_sinh_out = paddle.sinh(data_x) exe = fluid.Executor(place=fluid.CPUPlace()) exe.run(fluid.default_startup_program()) (np_sinh_res,) = exe.run( @@ -630,7 +610,7 @@ def test_backward(self): ) var = 
fluid.dygraph.to_variable(input_x) var.stop_gradient = False - loss = fluid.layers.sinh(var) + loss = paddle.sinh(var) loss.backward() grad_var = var.gradient() self.assertEqual(grad_var.shape, input_x.shape) @@ -640,13 +620,13 @@ class TestSinhOpError(unittest.TestCase): def test_errors(self): with program_guard(Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.sinh, 1) + self.assertRaises(TypeError, paddle.sinh, 1) # The input dtype must be float16, float32, float64. x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.sinh, x_int32) + self.assertRaises(TypeError, paddle.sinh, x_int32) # support the input dtype is float16 x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.sinh(x_fp16) + paddle.sinh(x_fp16) class TestCosh(TestActivation): @@ -678,7 +658,7 @@ def test_dygraph(self): with fluid.dygraph.guard(): np_x = np.array([0.1]) x = fluid.dygraph.to_variable(np_x) - z = fluid.layers.cosh(x).numpy() + z = paddle.cosh(x).numpy() z_expected = np.cosh(np_x) np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -715,7 +695,7 @@ def test_backward(self): ) var = fluid.dygraph.to_variable(input_x) var.stop_gradient = False - loss = fluid.layers.cosh(var) + loss = paddle.cosh(var) loss.backward() grad_var = var.gradient() self.assertEqual(grad_var.shape, input_x.shape) @@ -725,13 +705,13 @@ class TestCoshOpError(unittest.TestCase): def test_errors(self): with program_guard(Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.cosh, 1) + self.assertRaises(TypeError, paddle.cosh, 1) # The input dtype must be float16, float32, float64. x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.cosh, x_int32) + self.assertRaises(TypeError, paddle.cosh, x_int32) # support the input dtype is float16 x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.cosh(x_fp16) + paddle.cosh(x_fp16) def ref_tanhshrink(x): @@ -798,16 +778,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.tanh_shrink(x) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_tanhshrink(self.x_np) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -914,16 +884,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', [10, 12]) - out = fluid.layers.hard_shrink(x) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_hardshrink(self.x_np, 0.5) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -1080,16 +1040,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', 
self.x_np.shape, self.x_np.dtype) - out = fluid.layers.softshrink(x, self.threshold) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_softshrink(self.x_np, self.threshold) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -1780,16 +1730,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', [10, 12]) - out = fluid.layers.leaky_relu(x, 0.01) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_leaky_relu(self.x_np) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -3120,16 +3060,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.softplus(x) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_softplus(self.x_np) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -3215,16 +3145,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.softsign(x) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_softsign(self.x_np) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -3314,16 +3234,6 @@ def test_dygraph_api(self): np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05) paddle.enable_static() - def test_fluid_api(self): - paddle.enable_static() - with fluid.program_guard(fluid.Program()): - x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.thresholded_relu(x, self.threshold) - exe = fluid.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - out_ref = ref_thresholded_relu(self.x_np, self.threshold) - np.testing.assert_allclose(out_ref, res[0], rtol=1e-05) - def test_errors(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -3660,45 +3570,6 @@ def test_errors(self): F.mish(x_fp16) -# ------------------ Test Error Activation---------------------- -def create_test_error_class(op_type): - class TestOpErrors(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - op = getattr(fluid.layers, op_type) - # The input dtype of op_type must be float32, float64. 
- in1 = fluid.layers.data( - name='input2', shape=[12, 10], dtype="int32" - ) - in2 = fluid.layers.data( - name='input3', shape=[12, 10], dtype="int64" - ) - self.assertRaises(TypeError, op, in1) - self.assertRaises(TypeError, op, in2) - - cls_name = "{0}_{1}".format(op_type, "test_errors") - TestOpErrors.__name__ = cls_name - globals()[cls_name] = TestOpErrors - - -create_test_error_class('acos') -create_test_error_class('asin') -create_test_error_class('atan') -create_test_error_class('ceil') -create_test_error_class('cos') -create_test_error_class('floor') -create_test_error_class('reciprocal') -create_test_error_class('round') -create_test_error_class('rsqrt') -create_test_error_class('sin') -create_test_error_class('sqrt') -create_test_error_class('tanh') -create_test_error_class('tan') -create_test_error_class('acosh') -create_test_error_class('asinh') -create_test_error_class('atanh') - - # ------------------ Test Cudnn Activation---------------------- def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3): @unittest.skipIf( diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index b28c2863d1872..1621b6c543217 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -371,7 +371,7 @@ def test_error(self): def test_bad_x(): data = [1, 2, 4] - result = fluid.layers.cumsum(data, axis=0) + result = paddle.cumsum(data, axis=0) self.assertRaises(TypeError, test_bad_x) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index 42a96cc66f41d..87bf029956442 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -87,7 +87,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py index 4fc0e2eb5a0c3..d70ed0b9031ba 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -83,7 +83,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -111,7 +111,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = 
fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -138,7 +138,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index f38dbf9e56334..d8506f64a4af2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -86,7 +86,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -114,7 +114,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py index 1c8c3b4f879e0..0d531054a709b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -89,7 +89,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -148,7 +148,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 335577818259c..8d0fdd6f9c0cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -88,7 +88,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) 
+ q_ss = paddle.nn.functional.softsign(q_sum) q_ss = fluid.layers.data_norm(input=q_ss) # fc layer after conv q_fc = fluid.layers.fc( @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -148,7 +148,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index abd0ff1c858c1..80830b96936dc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -87,7 +87,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index cfc806d372b01..61561621d3839 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -85,7 +85,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -115,7 +115,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -144,7 +144,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index f88ca8fcb1de3..8729c4d63971c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -87,7 +87,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -119,7 +119,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -150,7 +150,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index 6cfae26323e5c..83c710c4eaef7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -85,7 +85,7 @@ def get_loss(cos_q_pt, cos_q_nt): q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') - q_ss = fluid.layers.softsign(q_sum) + q_ss = paddle.nn.functional.softsign(q_sum) # fc layer after conv q_fc = fluid.layers.fc( input=q_ss, @@ -115,7 +115,7 @@ def get_loss(cos_q_pt, cos_q_nt): pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') - pt_ss = fluid.layers.softsign(pt_sum) + pt_ss = paddle.nn.functional.softsign(pt_sum) # fc layer pt_fc = fluid.layers.fc( input=pt_ss, @@ -144,7 +144,7 @@ def get_loss(cos_q_pt, cos_q_nt): nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') - nt_ss = fluid.layers.softsign(nt_sum) + nt_ss = paddle.nn.functional.softsign(nt_sum) # fc layer nt_fc = fluid.layers.fc( input=nt_ss, diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index f785936dcb99f..c28ece138fb41 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -38,7 +38,7 @@ def gru_net( fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') - gru_max_tanh = fluid.layers.tanh(gru_max) + gru_max_tanh = paddle.tanh(gru_max) fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 7709460111ac7..998872f0acf21 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -40,7 +40,7 @@ def lstm_net( input=fc0, size=hid_dim * 4, is_reverse=False ) lstm_max = 
fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) + lstm_max_tanh = paddle.tanh(lstm_max) fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 8bdbe2aa9c963..e1271f30c4f35 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -191,10 +191,10 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): ends=[hidden_size * 4], ) - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i - ) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) @@ -299,10 +299,10 @@ def encoder_static( gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i - ) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) hidden_array[k] = m cell_array[k] = c diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index d89aba3da840e..21ebf05038e8c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -327,7 +327,9 @@ def create_rnn_op(self): bias_attr=False, ) - h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r)) + h = paddle.nn.functional.sigmoid( + x=layers.elementwise_add(x=temp_l, y=temp_r) + ) rnn.update_memory(h_pre, h) rnn.output(h) diff --git a/python/paddle/fluid/tests/unittests/test_erf_op.py b/python/paddle/fluid/tests/unittests/test_erf_op.py index 089fdc0a0b4bc..f21ae86d1c2ea 100644 --- a/python/paddle/fluid/tests/unittests/test_erf_op.py +++ b/python/paddle/fluid/tests/unittests/test_erf_op.py @@ -48,7 +48,7 @@ def _test_case(self, place): y_ref = erf(x) with dg.guard(place) as g: x_var = dg.to_variable(x) - y_var = fluid.layers.erf(x_var) + y_var = paddle.erf(x_var) y_test = y_var.numpy() np.testing.assert_allclose(y_ref, y_test, rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_gelu_op.py b/python/paddle/fluid/tests/unittests/test_gelu_op.py index 203e2517cd425..e8999b7ba9679 100644 --- a/python/paddle/fluid/tests/unittests/test_gelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_gelu_op.py @@ -45,7 +45,7 @@ def _test_case1_cpu(self, approximate): place = fluid.CPUPlace() with dg.guard(place) as g: x_var = dg.to_variable(x) - y_var = fluid.layers.gelu(x_var, approximate) + y_var = F.gelu(x_var, approximate) y_test = y_var.numpy() np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08) @@ -56,7 +56,7 @@ def _test_case1_gpu(self, approximate): place = fluid.CUDAPlace(0) with dg.guard(place) as g: x_var = dg.to_variable(x) - y_var = fluid.layers.gelu(x_var, 
approximate) + y_var = F.gelu(x_var, approximate) y_test = y_var.numpy() np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index a072e4454da2e..d8eaaef49b215 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -35,7 +35,7 @@ def bow_net( input=data, is_sparse=True, size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 87fc35b5fee5c..eaa8474c8246f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -914,7 +914,7 @@ def func_append_activation_in_dygraph2(self): with fluid.dygraph.guard(): a = paddle.to_tensor(a_np) res1 = func(a, act="sigmoid", use_mkldnn=True, use_cudnn=True) - res2 = fluid.layers.sigmoid(a) + res2 = paddle.nn.functional.sigmoid(a) np.testing.assert_allclose(res1.numpy(), res2.numpy(), rtol=1e-05) def test_append_activation_in_dygraph2(self): @@ -929,7 +929,7 @@ def func_append_activation_in_dygraph3(self): with fluid.dygraph.guard(): a = paddle.to_tensor(a_np) res1 = func(a, act="sigmoid", use_cudnn=True) - res2 = fluid.layers.sigmoid(a) + res2 = paddle.nn.functional.sigmoid(a) np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph3(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index f5a33f692bdc9..08a32aeaa9971 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -317,7 +317,7 @@ def forward(self, encoder_vec, encoder_proj, decoder_state): concated = fluid.layers.elementwise_add( encoder_proj, decoder_state_expand ) - concated = fluid.layers.tanh(x=concated) + concated = paddle.tanh(x=concated) attention_weight = self.fc_2(concated) weights_reshape = fluid.layers.reshape( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 8fecc45636f30..df0a8996aca5a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -115,10 +115,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): i, j, f, o = fluid.layers.split( gate_input, num_or_sections=4, dim=-1 ) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i - ) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) self.hidden_array[k] = m self.cell_array[k] = c self._input = m diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py 
b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 24e2116699221..a0b75d716074a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -110,10 +110,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): i, j, f, o = fluid.layers.split( gate_input, num_or_sections=4, dim=-1 ) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i - ) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) self.hidden_array[k] = m self.cell_array[k] = c self._input = m diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 8bba8042bf17f..1274200f31bb3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -112,10 +112,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): i, j, f, o = fluid.layers.split( gate_input, num_or_sections=4, dim=-1 ) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i - ) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) self.hidden_array[k] = m self.cell_array[k] = c self._input = m diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 96fc83402762a..6a6dd3f7712a3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -322,7 +322,7 @@ def forward(self, input, label_trg): res_block = self._res_block(conv0) deconv = self._deconv(res_block) conv1 = self._conv1(deconv) - out = fluid.layers.tanh(conv1) + out = paddle.tanh(conv1) return out @@ -437,11 +437,9 @@ def _interpolate(a, b): ) epsilon = 1e-16 - norm = fluid.layers.sqrt( - fluid.layers.reduce_sum(fluid.layers.square(gradient), dim=1) + epsilon - ) + norm = paddle.sqrt(paddle.sum(paddle.square(gradient), axis=1) + epsilon) - gp = fluid.layers.reduce_mean(fluid.layers.square(norm - 1.0)) + gp = paddle.mean(paddle.square(norm - 1.0)) return gp @@ -451,7 +449,7 @@ def get_generator_loss( fake_img = generator(image_real, label_trg) rec_img = generator(fake_img, label_org) g_loss_rec = fluid.layers.reduce_mean( - fluid.layers.abs(fluid.layers.elementwise_sub(image_real, rec_img)) + paddle.abs(paddle.subtract(image_real, rec_img)) ) pred_fake, cls_fake = discriminator(fake_img) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index ebebe754b2176..231a3157104c8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -182,7 +182,7 @@ def func_example_with_gradient_and_create_graph(self): numel = z_np.size z.stop_gradient = False - out = fluid.layers.sigmoid(paddle.matmul(x, y) + z) + out = 
paddle.nn.functional.sigmoid(paddle.matmul(x, y) + z) out_np = out.numpy() (dx_actual,) = self.grad([out], [x], create_graph=True) @@ -278,7 +278,7 @@ def func_example_with_gradient_and_create_graph(self): numel = z_np.size z.stop_gradient = False - out = fluid.layers.sigmoid(paddle.matmul(x, y) + z) + out = paddle.nn.functional.sigmoid(paddle.matmul(x, y) + z) out_np = out.numpy() (dx_actual,) = self.grad([out], [x], create_graph=True) diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index ff033d51efdbd..d02214623b7ce 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -87,7 +87,7 @@ def build_program( # a new Variable for fetch bn = bn * 1.0 - sigmoid = fluid.layers.sigmoid(bn) + sigmoid = paddle.nn.functional.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) if not only_forward: sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py index f661db6d15e50..85f78d9aef4a1 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py @@ -41,7 +41,7 @@ def lstm_net( input=fc0, size=hid_dim * 4, is_reverse=False ) lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) + lstm_max_tanh = paddle.tanh(lstm_max) fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f97c94858bd6a..621e8090d2a14 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3680,126 +3680,6 @@ def make_prelu(self): ) return out - def make_sigmoid(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.sigmoid(input, name='sigmoid') - return out - - def make_exp(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.exp(input, name='exp') - return out - - def make_tanh(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.tanh(input, name='tanh') - return out - - def make_tanh_shrink(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.tanh_shrink(input, name='tanh_shrink') - return out - - def make_sqrt(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.sqrt(input, name='sqrt') - return out - - def make_abs(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.abs(input, 
name='abs') - return out - - def make_ceil(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.ceil(input, name='ceil') - return out - - def make_floor(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.floor(input, name='floor') - return out - - def make_cos(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.cos(input, name='cos') - return out - - def make_sin(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.sin(input, name='sin') - return out - - def make_round(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.round(input, name='round') - return out - - def make_reciprocal(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.reciprocal(input, name='reciprocal') - return out - - def make_square(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.square(input, name='square') - return out - - def make_softplus(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.softplus(input, name='softplus') - return out - - def make_softsign(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.softsign(input, name='softsign') - return out - def make_mish(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() @@ -3920,14 +3800,6 @@ def make_scale_variable(self): out = layers.scale(input, scale=scale_var) return out - def make_softshrink(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.softshrink(input, alpha=0.3) - return out - def make_iou_similarity(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_lgamma_op.py b/python/paddle/fluid/tests/unittests/test_lgamma_op.py index 54211aef14ffa..3b88e7fda57c9 100644 --- a/python/paddle/fluid/tests/unittests/test_lgamma_op.py +++ b/python/paddle/fluid/tests/unittests/test_lgamma_op.py @@ -63,7 +63,7 @@ def test_lgamma(self): shape = (1, 4) data = np.random.random(shape).astype(self.dtype) + 1 data_ = paddle.to_tensor(data) - out = paddle.fluid.layers.lgamma(data_) + out = paddle.lgamma(data_) result = special.gammaln(data) np.testing.assert_allclose(result, out.numpy(), rtol=1e-05) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index 
ad7617654d275..9be70eeca2a50 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -50,7 +50,7 @@ def lstm_net(use_feed): input=fc0, size=hid_dim * 4, is_reverse=False ) lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) + lstm_max_tanh = paddle.tanh(lstm_max) fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 961216795671f..a764dbf4a595b 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -81,7 +81,7 @@ def simple_fc_net(img, label, use_py_func_op): ), ) if not use_py_func_op: - hidden = fluid.layers.tanh(hidden) + hidden = paddle.tanh(hidden) else: new_hidden = ( fluid.default_main_program() diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 29c12daf55ff3..e8069c5f06363 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -316,7 +316,9 @@ def create_rnn_op(self): bias_attr=False, ) - h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r)) + h = paddle.nn.functional.sigmoid( + x=layers.elementwise_add(x=temp_l, y=temp_r) + ) rnn.update_memory(h_pre, h) rnn.output(h) @@ -710,7 +712,9 @@ def create_rnn_op(self): bias_attr=False, ) - h = layers.sigmoid(x=layers.elementwise_add(temp_l, temp_r)) + h = paddle.nn.functional.sigmoid( + x=layers.elementwise_add(temp_l, temp_r) + ) rnn.update_memory(h_pre, h) rnn.output(h) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 35e7f47fefbac..5636c8d74e9a8 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -135,7 +135,7 @@ def bow_net( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") @@ -225,7 +225,7 @@ def check_l2decay(self, place, model): param_list = fluid.default_main_program().block(0).all_parameters() para_sum = [] for para in param_list: - para_mul = fluid.layers.square(x=para) + para_mul = paddle.square(x=para) para_sum.append(fluid.layers.reduce_sum(input=para_mul)) avg_cost_l2 += fluid.layers.sums(para_sum) * 0.5 diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py index 930de8429f493..c4d92cbe03b73 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -41,7 +41,7 @@ def bow_net( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = 
fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") @@ -133,7 +133,7 @@ def check_l2decay(self, place, model): param_list = fluid.default_main_program().block(0).all_parameters() para_sum = [] for para in param_list: - para_mul = fluid.layers.square(x=para) + para_mul = paddle.square(x=para) para_sum.append(fluid.layers.reduce_sum(input=para_mul)) avg_cost_l2 += fluid.layers.sums(para_sum) * 0.5 diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 921652cec3459..ff0c6ba86e8ec 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -30,7 +30,7 @@ def __init__(self): def forward(self, x): x = self.conv1(x) - x = fluid.layers.tanh(x) + x = paddle.tanh(x) return x diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index e177e351a6de4..5c1e96ebb8477 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -122,10 +122,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): i, j, f, o = fluid.layers.split( gate_input, num_or_sections=4, dim=-1 ) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i - ) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + c = pre_cell * paddle.nn.functional.sigmoid( + f + ) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j) + m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o) self.hidden_array[k] = m self.cell_array[k] = c self._input = m diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 7e885944d6b0a..e3fec39c4525a 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -94,7 +94,7 @@ def _build_program( bn = fluid.layers.cast(bn, 'float32') else: bn = fluid.layers.cast(bn, 'float64') - sigmoid = fluid.layers.sigmoid(bn) + sigmoid = paddle.nn.functional.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) if not sync_bn: out = out / core.get_cuda_device_count() diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py index 5133a20375593..f4a1d0b965c09 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_decay.py +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -59,7 +59,7 @@ def bow_net( input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim] ) bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) + bow_tanh = paddle.tanh(bow) fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 2baf56caaf391..c45e17ab77694 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -15,6 +15,7 @@ from functools import partial import numpy as np +import paddle import 
paddle.fluid as fluid import paddle.fluid.layers as layers @@ -156,7 +157,7 @@ def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): # So, here define the softmax for temporary solution. def __softmax(x, eps=1e-9): - exp_out = layers.exp(x=x) + exp_out = paddle.exp(x=x) sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False) return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index 17372ea4f175d..d74bd1e3d89fb 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -209,7 +209,7 @@ def _dygraph_clip(self, params_grads): global_norm_var = global_norm_var_normal + global_norm_var_moe params_and_grads = [] - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm ) diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 8a7a7f586d65c..1f9c60cf508b3 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -557,7 +557,7 @@ def _add_average_apply_op(self, block, param): sum = layers.cast( x=sum, dtype='float32' if self._dtype is None else self._dtype ) - layers.ops._elementwise_div(x=sum, y=tmp, out=param) + paddle.tensor.ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param): param = block._clone_variable(param) From 56f15c439781b135257aab648739d2d80b6ae009 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 21 Nov 2022 16:18:11 +0800 Subject: [PATCH 123/210] refine reduce_all (#48133) * refine reduce_all --- paddle/phi/core/kernel_utils.h | 11 +++++++++++ paddle/phi/kernels/cpu/prod_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce.h | 2 ++ paddle/phi/kernels/cpu/reduce_all_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_amax_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_amin_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_any_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_max_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_mean_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_min_kernel.cc | 1 + paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc | 1 + paddle/phi/kernels/funcs/reduce_function.h | 1 + paddle/phi/kernels/gpu/frobenius_norm_kernel.cu | 1 + paddle/phi/kernels/gpu/reduce.h | 1 + paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/reduce_amin_amax_common.h | 4 +--- paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/reduce_grad.h | 1 + paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu | 4 +--- paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu | 4 +--- .../kernels/impl/frobenius_norm_grad_kernel_impl.h | 1 + paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h | 1 + paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h | 4 +--- paddle/phi/kernels/impl/logsumexp_kernel_impl.h | 4 +--- paddle/phi/kernels/impl/prod_grad_kernel_impl.h | 1 + paddle/phi/kernels/impl/reduce_grad.h | 6 +++--- paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h | 1 + paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h | 1 + 
paddle/phi/kernels/kps/prod_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_all_kernel.cu | 3 ++- paddle/phi/kernels/kps/reduce_amax_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_amin_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_any_kernel.cu | 3 ++- paddle/phi/kernels/kps/reduce_max_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_mean_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_min_kernel.cu | 1 + paddle/phi/kernels/kps/reduce_sum_kernel.cu | 2 ++ paddle/phi/kernels/onednn/reduce_kernel_impl.h | 2 ++ paddle/phi/kernels/onednn/reduce_max_kernel.cc | 1 + paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc | 1 + paddle/phi/kernels/onednn/reduce_mean_kernel.cc | 1 + paddle/phi/kernels/onednn/reduce_min_kernel.cc | 1 + paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc | 1 + paddle/phi/kernels/onednn/reduce_sum_kernel.cc | 1 + paddle/phi/kernels/prod_kernel.cc | 2 +- paddle/phi/kernels/reduce_all_kernel.cc | 5 +---- paddle/phi/kernels/reduce_amax_kernel.cc | 5 +---- paddle/phi/kernels/reduce_amin_kernel.cc | 5 +---- paddle/phi/kernels/reduce_any_kernel.cc | 5 +---- paddle/phi/kernels/reduce_max_kernel.cc | 5 +---- paddle/phi/kernels/reduce_mean_kernel.cc | 5 +---- paddle/phi/kernels/reduce_min_kernel.cc | 5 +---- paddle/phi/kernels/reduce_sum_kernel.cc | 5 +---- paddle/phi/kernels/xpu/prod_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce.h | 1 + paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce_max_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce_mean_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce_min_kernel.cc | 1 + paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc | 4 +--- paddle/phi/kernels/xpu/reduce_sum_kernel.cc | 1 + 65 files changed, 82 insertions(+), 56 deletions(-) diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 55ea3a31eb318..05d8e259cff10 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -336,4 +336,15 @@ struct KernelImpl { }; }; +inline bool recompute_reduce_all(const DenseTensor& x, + const IntArray& dims, + bool reduce_all = false) { + if (dims.size() == 0 || static_cast(dims.size()) == x.dims().size() || + reduce_all) { + return true; + } else { + return false; + } +} + } // namespace phi diff --git a/paddle/phi/kernels/cpu/prod_kernel.cc b/paddle/phi/kernels/cpu/prod_kernel.cc index af5ea5cb9568d..d5a07c0057dd7 100644 --- a/paddle/phi/kernels/cpu/prod_kernel.cc +++ b/paddle/phi/kernels/cpu/prod_kernel.cc @@ -28,6 +28,7 @@ void ProdRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index e5f610b955409..bfcbe0eee1f60 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -30,6 +30,7 @@ void Reduce(const DeviceContext& dev_ctx, bool keep_dim, DataType out_dtype, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); // If the dims has full dim, set the reduce_all is True const int& input_dim_size = x.dims().size(); std::set dims_set(dims.begin(), dims.end()); @@ -71,6 +72,7 @@ void BoolReduceKernel(const DeviceContext& dev_ctx, bool keep_dim, bool reduce_all, phi::DenseTensor* output) { + reduce_all = recompute_reduce_all(input, dims, reduce_all); dev_ctx.template Alloc(output); // The dims has full 
dim, set the reduce_all is True diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/reduce_all_kernel.cc index 3e8e38ee4447e..60094d1345a77 100644 --- a/paddle/phi/kernels/cpu/reduce_all_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_all_kernel.cc @@ -28,6 +28,7 @@ void AllRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); phi::BoolReduceKernel( dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc index ffe9133d6d94c..731ee34636580 100644 --- a/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_amax_grad_kernel.cc @@ -28,6 +28,7 @@ void ReduceAMaxGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel( dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/cpu/reduce_amax_kernel.cc b/paddle/phi/kernels/cpu/reduce_amax_kernel.cc index ac3b5ce762e29..72ac780e40071 100644 --- a/paddle/phi/kernels/cpu/reduce_amax_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_amax_kernel.cc @@ -28,6 +28,7 @@ void AMaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc index 6bb0e5061cc20..1165e4c7545ab 100644 --- a/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_amin_grad_kernel.cc @@ -28,6 +28,7 @@ void ReduceAMinGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel( dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/cpu/reduce_amin_kernel.cc b/paddle/phi/kernels/cpu/reduce_amin_kernel.cc index d8f090f93ffd3..47aa5210f3297 100644 --- a/paddle/phi/kernels/cpu/reduce_amin_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_amin_kernel.cc @@ -28,6 +28,7 @@ void AMinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc index 4fd71f1d0b169..553393e7dba35 100644 --- a/paddle/phi/kernels/cpu/reduce_any_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_any_kernel.cc @@ -28,6 +28,7 @@ void AnyRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); phi::BoolReduceKernel( dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc index b15a555a2cf4d..d71476a0f920d 100644 --- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_max_kernel.cc @@ -28,6 +28,7 @@ void MaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); 
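Every raw reduce kernel touched by this commit gains the same first statement: the scattered "dims.size() == 0 means reduce over all dims" checks are replaced by one call to the recompute_reduce_all() helper added to kernel_utils.h. A condensed sketch of the pattern, with a hypothetical kernel name and the template arguments elided as in the hunks above:

#include "paddle/phi/core/kernel_utils.h"

template <typename T, typename Context>
void SomeReduceRawKernel(const Context& dev_ctx,
                         const phi::DenseTensor& x,
                         const phi::IntArray& dims,
                         bool keep_dim,
                         bool reduce_all,
                         phi::DenseTensor* out) {
  // Treat an empty axis list, an axis list covering every dimension,
  // or an explicit request as a full reduction.
  reduce_all = phi::recompute_reduce_all(x, dims, reduce_all);
  // ... dispatch to the existing phi::Reduce / ReduceGradKernel path ...
}

Centralizing the check means reduce_all is normalized once at kernel entry instead of being re-derived ad hoc in each forward and grad variant.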
phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc index 3ab8a40a85e55..b19f6ebdad806 100644 --- a/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_mean_grad_kernel.cc @@ -28,6 +28,7 @@ void ReduceMeanGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel(dev_ctx, x, paddle::none, diff --git a/paddle/phi/kernels/cpu/reduce_mean_kernel.cc b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc index 7164ec8b2bf99..2ab1b3e5a4739 100644 --- a/paddle/phi/kernels/cpu/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_mean_kernel.cc @@ -28,6 +28,7 @@ void MeanRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc index a11de5ea81ab6..286951f6720d7 100644 --- a/paddle/phi/kernels/cpu/reduce_min_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_min_kernel.cc @@ -28,6 +28,7 @@ void MinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index 87e3df717b244..e7d73611cf041 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -77,6 +77,7 @@ void ReduceSumGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); if (dims.size() == 1) { if (out_grad.dtype() != x.dtype()) { DenseTensorMeta x_grad_meta( diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 1b1a55b25c5ec..be64e3c7db7dd 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -58,6 +58,7 @@ using dim3 = phi::kps::dim3; #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/utils/array.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu index a439711f5d045..9878aa6ee231d 100644 --- a/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/frobenius_norm_kernel.cu @@ -26,6 +26,7 @@ void FrobeniusNormKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index bb914defbe892..0d6edd13ac9d6 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -36,6 +36,7 @@ void Reduce(const KPDevice& dev_ctx, DataType out_dtype, DenseTensor* out, bool 
is_mean = false) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); std::vector reduce_dims = phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all); diff --git a/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu index a75ef42889da2..db6cb2274cdc6 100644 --- a/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_amax_grad_kernel.cu @@ -28,6 +28,7 @@ void ReduceAMaxGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceCudaAMaxAMinGrad( dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h index 5d90433ad22e3..ed6e0ef51558a 100644 --- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h +++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h @@ -32,15 +32,13 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto* in_x = &x; auto* out_y = &out; auto* d_out = &out_grad; auto* d_x = x_grad; // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); - if (dims.size() == 0) { - reduce_all = true; - } auto reduce_dims = funcs::details::GetReduceDim(dims, dim_size, reduce_all); auto update_dims = vectorize(d_x->dims()); int reduce_num = 1; diff --git a/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu index 152ef494b4c13..58598cae56a35 100644 --- a/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_amin_grad_kernel.cu @@ -29,6 +29,7 @@ void ReduceAMinGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceCudaAMaxAMinGrad( dev_ctx, x, out, out_grad, dims, keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index ed6cc0c3c2022..01f91924645fa 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -52,6 +52,7 @@ void ReduceGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad, Functor functor) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto* in_x = &x; auto* d_out = &out_grad; auto* d_x = x_grad; diff --git a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu index 40c317e1262c5..d7b3adfcd6f48 100644 --- a/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_mean_grad_kernel.cu @@ -29,11 +29,9 @@ void ReduceMeanGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = x.dims().size(); - if (dims.size() == 0) { - reduce_all = true; - } std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 74209afe37467..04b3253178902 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -29,11 +29,9 @@ void ReduceSumGradKernel(const Context& dev_ctx, bool keep_dim, 
bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); // get reduce_dim for reduce_mean_grad int dim_size = x.dims().size(); - if (dims.size() == 0) { - reduce_all = true; - } std::vector reduce_dims = funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h index 96cf08af9634f..385ea68e6e707 100644 --- a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h @@ -29,6 +29,7 @@ void FrobeniusNormGradKernel(const Context& ctx, bool keep_dim, bool reduce_all, DenseTensor* dx) { + reduce_all = recompute_reduce_all(x, axis, reduce_all); ReduceGradKernel( ctx, x, out, dout, axis, keep_dim, reduce_all, dx); } diff --git a/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h index d1de47e128e57..7dbc3ab3af7ba 100644 --- a/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h @@ -27,6 +27,7 @@ void FrobeniusNormKernel(const Context& ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, axis, reduce_all); Reduce( ctx, x, reduce_all, axis, keep_dim, x.dtype(), out); } diff --git a/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h index 098503f82cd20..0db6c12d4a07c 100644 --- a/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/logsumexp_grad_kernel_impl.h @@ -60,9 +60,7 @@ void LogsumexpGradKernel(const Context& dev_ctx, DenseTensor* in_grad) { dev_ctx.template Alloc(in_grad); - if (axis.size() == 0 || static_cast(axis.size()) == in.dims().size()) { - reduce_all = true; - } + reduce_all = recompute_reduce_all(in, axis, reduce_all); if (reduce_all) { auto x = phi::EigenVector::Flatten(in); diff --git a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h index 0d16dc7baf621..cc5057396265c 100644 --- a/paddle/phi/kernels/impl/logsumexp_kernel_impl.h +++ b/paddle/phi/kernels/impl/logsumexp_kernel_impl.h @@ -69,9 +69,7 @@ void LogsumexpKernel(const Context& dev_ctx, DenseTensor* out) { dev_ctx.template Alloc(out); - if (axis.size() == 0 || static_cast(axis.size()) == x.dims().size()) { - reduce_all = true; - } + reduce_all = recompute_reduce_all(x, axis, reduce_all); if (reduce_all) { // Flatten and reduce 1-D tensor diff --git a/paddle/phi/kernels/impl/prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/prod_grad_kernel_impl.h index 13f517c072c15..208e3362de48a 100644 --- a/paddle/phi/kernels/impl/prod_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/prod_grad_kernel_impl.h @@ -30,6 +30,7 @@ void ProdGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel( dev_ctx, x, out, out_grad, dims.GetData(), keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h index 40b62cc83fa73..e9d1aec0f09c5 100644 --- a/paddle/phi/kernels/impl/reduce_grad.h +++ b/paddle/phi/kernels/impl/reduce_grad.h @@ -34,6 +34,7 @@ void ComputeFromInput(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto* input0 = &x; auto* input1 = 
out.get_ptr(); auto* output = x_grad; @@ -91,9 +92,8 @@ void ReduceGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { - if (dims.size() == 0) { - reduce_all = true; - } + reduce_all = recompute_reduce_all(x, dims, reduce_all); + if (x.dtype() != out_grad.dtype()) { DenseTensorMeta x_grad_meta( out_grad.dtype(), x_grad->dims(), x_grad->layout()); diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h index 33730a3717781..1d73b582ea0f5 100644 --- a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -29,6 +29,7 @@ void ReduceMaxGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel( dev_ctx, x, out, out_grad, dims.GetData(), keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h index 93afa07ff01af..1f27ed10392cc 100644 --- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -29,6 +29,7 @@ void ReduceMinGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel( dev_ctx, x, out, out_grad, dims.GetData(), keep_dim, reduce_all, x_grad); } diff --git a/paddle/phi/kernels/kps/prod_kernel.cu b/paddle/phi/kernels/kps/prod_kernel.cu index 326a351f6dabb..79dc76f81c032 100644 --- a/paddle/phi/kernels/kps/prod_kernel.cu +++ b/paddle/phi/kernels/kps/prod_kernel.cu @@ -25,6 +25,7 @@ void ProdRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_all_kernel.cu b/paddle/phi/kernels/kps/reduce_all_kernel.cu index 0459acd982269..d4d4596917bf8 100644 --- a/paddle/phi/kernels/kps/reduce_all_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_all_kernel.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/phi/kernels/reduce_all_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/reduce_all_kernel.h" namespace phi { @@ -25,6 +25,7 @@ void AllRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_amax_kernel.cu b/paddle/phi/kernels/kps/reduce_amax_kernel.cu index 57197fd9d5b8a..f762a30638f05 100644 --- a/paddle/phi/kernels/kps/reduce_amax_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_amax_kernel.cu @@ -25,6 +25,7 @@ void AMaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_amin_kernel.cu b/paddle/phi/kernels/kps/reduce_amin_kernel.cu index 230adcc829441..e5d15b337fa04 100644 --- a/paddle/phi/kernels/kps/reduce_amin_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_amin_kernel.cu @@ -25,6 +25,7 @@ void AMinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_any_kernel.cu b/paddle/phi/kernels/kps/reduce_any_kernel.cu index 480268936f49f..3210f23c3b205 100644 --- a/paddle/phi/kernels/kps/reduce_any_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_any_kernel.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/phi/kernels/reduce_any_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/reduce_any_kernel.h" namespace phi { @@ -25,6 +25,7 @@ void AnyRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu index fb47b64f6ecec..9c0fdb52c4279 100644 --- a/paddle/phi/kernels/kps/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -25,6 +25,7 @@ void MaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_mean_kernel.cu b/paddle/phi/kernels/kps/reduce_mean_kernel.cu index 7f7946e030063..8fc63b2256db9 100644 --- a/paddle/phi/kernels/kps/reduce_mean_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_mean_kernel.cu @@ -25,6 +25,7 @@ void MeanRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out, true); diff --git a/paddle/phi/kernels/kps/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu index 9c3e61d3c0bc5..450fee16b4ca9 100644 --- a/paddle/phi/kernels/kps/reduce_min_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu @@ -25,6 +25,7 @@ void MinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto out_dtype = x.dtype(); phi::Reduce( dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); diff --git a/paddle/phi/kernels/kps/reduce_sum_kernel.cu b/paddle/phi/kernels/kps/reduce_sum_kernel.cu index c5a30a6a634a8..e6030db8aa325 100644 --- a/paddle/phi/kernels/kps/reduce_sum_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_sum_kernel.cu @@ -35,6 +35,7 @@ void ReduceSumEigen(const KPDevice& dev_ctx, DataType out_dtype, DenseTensor* out, std::vector* reduce_dims) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); // Resize Input Tensor auto new_x = x; int added_dims = EigenDimSize - x.dims().size(); @@ -79,6 +80,7 @@ void SumRawKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) { out_dtype = out->dtype(); } diff --git a/paddle/phi/kernels/onednn/reduce_kernel_impl.h b/paddle/phi/kernels/onednn/reduce_kernel_impl.h index 4665876469cd5..7a2f66ec984e5 100644 --- a/paddle/phi/kernels/onednn/reduce_kernel_impl.h +++ b/paddle/phi/kernels/onednn/reduce_kernel_impl.h @@ -46,6 +46,7 @@ void ReduceKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* out, dnnl::algorithm reduction_type) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); const auto& onednn_engine = dev_ctx.GetEngine(); auto x_tz = vectorize(x.dims()); auto out_tz = @@ -116,6 +117,7 @@ void ReduceGradKernel(const Context& dev_ctx, dnnl::algorithm reduction_type, float scale_x, float scale_y) { + reduce_all = recompute_reduce_all(x, 
dims, reduce_all); const auto& onednn_engine = dev_ctx.GetEngine(); auto out_grad_tz = CalculateReducedDims( x_grad, &out_grad, dims.GetData(), reduce_all, keep_dim); diff --git a/paddle/phi/kernels/onednn/reduce_max_kernel.cc b/paddle/phi/kernels/onednn/reduce_max_kernel.cc index 9e3932d7f0b5c..3ece76367598a 100644 --- a/paddle/phi/kernels/onednn/reduce_max_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_max_kernel.cc @@ -24,6 +24,7 @@ void MaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceKernel(dev_ctx, x, dims, diff --git a/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc index 4395126821bad..fd566782b182e 100644 --- a/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_mean_grad_kernel.cc @@ -25,6 +25,7 @@ void MeanGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); auto input_dims = phi::vectorize(x.dims()); std::vector reduce_dims = dims.GetData(); int number_of_elements = 1; diff --git a/paddle/phi/kernels/onednn/reduce_mean_kernel.cc b/paddle/phi/kernels/onednn/reduce_mean_kernel.cc index 22e6b3f87b1f5..a6d72c03e7767 100644 --- a/paddle/phi/kernels/onednn/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_mean_kernel.cc @@ -24,6 +24,7 @@ void MeanRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceKernel(dev_ctx, x, dims, diff --git a/paddle/phi/kernels/onednn/reduce_min_kernel.cc b/paddle/phi/kernels/onednn/reduce_min_kernel.cc index 177e588d38ef6..d5985efcbaac3 100644 --- a/paddle/phi/kernels/onednn/reduce_min_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_min_kernel.cc @@ -24,6 +24,7 @@ void MinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceKernel(dev_ctx, x, dims, diff --git a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc index cd21d36cba7b1..10b914a2005cd 100644 --- a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc @@ -25,6 +25,7 @@ void SumGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceGradKernel(dev_ctx, x, out_grad, diff --git a/paddle/phi/kernels/onednn/reduce_sum_kernel.cc b/paddle/phi/kernels/onednn/reduce_sum_kernel.cc index e5b1d8b6fb432..81e77546b490a 100644 --- a/paddle/phi/kernels/onednn/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_sum_kernel.cc @@ -25,6 +25,7 @@ void SumRawKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); ReduceKernel(dev_ctx, x, dims, diff --git a/paddle/phi/kernels/prod_kernel.cc b/paddle/phi/kernels/prod_kernel.cc index 532b6fdaa141f..1fce5167da958 100644 --- a/paddle/phi/kernels/prod_kernel.cc +++ b/paddle/phi/kernels/prod_kernel.cc @@ -25,7 +25,7 @@ void ProdKernel(const Context& dev_ctx, const IntArray& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; + bool reduce_all = false; // recompute_reduce_all(x, dims); ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git 
a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc index 5b8d2cbecca5f..e1651f12c1c84 100644 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -25,10 +25,7 @@ void AllKernel(const Context& dev_ctx, const std::vector& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0 || static_cast(dims.size()) == x.dims().size()) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_amax_kernel.cc b/paddle/phi/kernels/reduce_amax_kernel.cc index 47b5e97467fe7..87e432c5c20a7 100644 --- a/paddle/phi/kernels/reduce_amax_kernel.cc +++ b/paddle/phi/kernels/reduce_amax_kernel.cc @@ -25,10 +25,7 @@ void AMaxKernel(const Context& dev_ctx, const std::vector& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); AMaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_amin_kernel.cc b/paddle/phi/kernels/reduce_amin_kernel.cc index 8da4f3afd9f43..a355da64230dc 100644 --- a/paddle/phi/kernels/reduce_amin_kernel.cc +++ b/paddle/phi/kernels/reduce_amin_kernel.cc @@ -25,10 +25,7 @@ void AMinKernel(const Context& dev_ctx, const std::vector& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); AMinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc index cc70e3968067c..2baa1edb094b9 100644 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -25,10 +25,7 @@ void AnyKernel(const Context& dev_ctx, const std::vector& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0 || static_cast(dims.size()) == x.dims().size()) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index 64079cb2aefad..23da5bd4cd54e 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -25,10 +25,7 @@ void MaxKernel(const Context& dev_ctx, const IntArray& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0 || static_cast(dims.size()) == x.dims().size()) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index aa615a6bb1ef1..83906fdfc0853 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -25,10 +25,7 @@ void MeanKernel(const Context& dev_ctx, const IntArray& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index 7a14d106c3d74..660d3b753e97e 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ 
b/paddle/phi/kernels/reduce_min_kernel.cc @@ -25,10 +25,7 @@ void MinKernel(const Context& dev_ctx, const IntArray& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0 || static_cast(dims.size()) == x.dims().size()) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index 70c88c23585b3..c6cfe42566372 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -26,10 +26,7 @@ void SumKernel(const Context& dev_ctx, DataType out_dtype, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; - if (dims.size() == 0) { - reduce_all = true; - } + bool reduce_all = recompute_reduce_all(x, dims); SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); } diff --git a/paddle/phi/kernels/xpu/prod_kernel.cc b/paddle/phi/kernels/xpu/prod_kernel.cc index 7be48a8bab774..cf237afb22797 100644 --- a/paddle/phi/kernels/xpu/prod_kernel.cc +++ b/paddle/phi/kernels/xpu/prod_kernel.cc @@ -28,6 +28,7 @@ void ProdRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); int r = XPUReduce(dev_ctx, x, dims.GetData(), diff --git a/paddle/phi/kernels/xpu/reduce.h b/paddle/phi/kernels/xpu/reduce.h index 81fe362a61a06..49c9eb5ea684f 100644 --- a/paddle/phi/kernels/xpu/reduce.h +++ b/paddle/phi/kernels/xpu/reduce.h @@ -33,6 +33,7 @@ int XPUReduce(const Context& dev_ctx, T*, const std::vector&, const std::vector&)> func) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); dev_ctx.template Alloc(out); const auto* x_data = x.data(); diff --git a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc index 1bfc5ae5f877e..b1561233ea1d4 100644 --- a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc @@ -31,6 +31,7 @@ void ReduceMaxGradKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* x_grad) { + reduce_all = recompute_reduce_all(x, dims_arr, reduce_all); auto dims = dims_arr.GetData(); dev_ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/xpu/reduce_max_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_kernel.cc index d0994f580cfbf..8db710a24adce 100644 --- a/paddle/phi/kernels/xpu/reduce_max_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_kernel.cc @@ -28,6 +28,7 @@ void MaxRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); int r = XPUReduce(dev_ctx, x, dims.GetData(), diff --git a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc index 0c2fe9a9d9e64..afe84e43d99d1 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc @@ -31,6 +31,7 @@ void ReduceMeanGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; + reduce_all = recompute_reduce_all(x, dims_arr, reduce_all); dev_ctx.template Alloc(x_grad); const XPUType* dy_data = reinterpret_cast(out_grad.data()); diff --git a/paddle/phi/kernels/xpu/reduce_mean_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_kernel.cc index 4af1ba2da2756..d29db35517f37 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_kernel.cc +++ 
b/paddle/phi/kernels/xpu/reduce_mean_kernel.cc @@ -28,6 +28,7 @@ void MeanRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); int r = XPUReduce(dev_ctx, x, dims.GetData(), diff --git a/paddle/phi/kernels/xpu/reduce_min_kernel.cc b/paddle/phi/kernels/xpu/reduce_min_kernel.cc index c54aca1830b0a..e330e30becdcf 100644 --- a/paddle/phi/kernels/xpu/reduce_min_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_min_kernel.cc @@ -28,6 +28,7 @@ void MinRawKernel(const Context& dev_ctx, bool keep_dim, bool reduce_all, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); int r = XPUReduce(dev_ctx, x, dims.GetData(), diff --git a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc index b6e4d1021e47d..0ba67f68bccf3 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc @@ -28,13 +28,11 @@ void ReduceSumGradKernel(const Context& dev_ctx, bool reduce_all, DenseTensor* x_grad) { using XPUType = typename XPUTypeTrait::Type; + reduce_all = recompute_reduce_all(x, dims_arr, reduce_all); auto dims = dims_arr.GetData(); dev_ctx.template Alloc(x_grad); const auto* out_data = out_grad.data(); auto* x_grad_data = x_grad->data(); - if (dims_arr.size() == 0) { - reduce_all = true; - } const auto& input_dim_size = x.dims().size(); std::vector true_dims; for (size_t i = 0; i < dims.size(); ++i) { diff --git a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc index 74c50304b1407..952ed101cdcb8 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_kernel.cc @@ -29,6 +29,7 @@ void SumRawKernel(const Context& dev_ctx, bool reduce_all, DataType out_dtype, DenseTensor* out) { + reduce_all = recompute_reduce_all(x, dims, reduce_all); int r = XPUReduce(dev_ctx, x, dims.GetData(), From d79eda716bfdb31bc1dc41ecf6aefb5330e50ef3 Mon Sep 17 00:00:00 2001 From: lzy <569782149@qq.com> Date: Mon, 21 Nov 2022 16:25:02 +0800 Subject: [PATCH 124/210] mma qk tensor_core (#48087) * use mma for QK dot computing in fused_multi_transformer. * Update fused_multi_transformer_op.cu.h --- .../fused/fused_multi_transformer_op.cu.h | 119 +++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index e6f4461f0c157..777ee83c38dc6 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -13,6 +13,8 @@ limitations under the License. */ // https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu // We add License in the head. 
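The patch beginning here ("mma qk tensor_core") moves the QK dot product of the masked multi-head attention kernel onto m16n8k8 tensor-core MMA instructions for __CUDA_ARCH__ >= 750. Numerically, the new qk_hmma_dot_ path computes the same scaled dot product as the scalar reduction it replaces; the illustration-only scalar equivalent below uses placeholder types and names, not the kernel's actual code.

// Illustration only: what one thread accumulates before the cross-lane reduction.
struct UnpackedHalf2 { float x, y; };  // stand-in for a half2 unpacked to float2

float qk_dot_reference(const UnpackedHalf2* q, const UnpackedHalf2* k,
                       int n, float inv_sqrt_dh) {
  float acc = 0.f;
  for (int i = 0; i < n; ++i) {
    acc += q[i].x * inv_sqrt_dh * k[i].x;  // mirrors mul(q[i], inv_sqrt_dh)
    acc += q[i].y * inv_sqrt_dh * k[i].y;  // then fma(inv_q, k[i], qk_vec)
  }
  // In the kernel, a cross-lane reduction (warp shuffles, or the MMA itself on
  // the tensor-core path) still combines these per-thread partial sums.
  return acc;
}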
+#pragma once + #include #include @@ -88,6 +90,23 @@ using float16 = plat::float16; #define MMHA_USE_FP32_ACUM_FOR_LOGITS #define MMHA_USE_FP32_ACUM_FOR_OUT +#define MMHA_USE_FP32_ACUM_FOR_FMA +#define MMHA_USE_HMMA_FOR_REDUCTION + +template +class PDDataTypeTraits; + +template <> +class PDDataTypeTraits { + public: + typedef float DataType; +}; + +template <> +class PDDataTypeTraits { + public: + typedef half DataType; +}; template struct Masked_multihead_attention_params { @@ -150,6 +169,17 @@ template <> struct V_vec_ { using Type = uint32_t; }; template <> struct V_vec_ { using Type = uint2; }; template <> struct V_vec_ { using Type = uint4; }; +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA +template +struct K_vec_acum_fp32_ { +}; + +template<> +struct K_vec_acum_fp32_ { + using Type = float2; +}; +#endif + #ifdef MMHA_USE_FP32_ACUM_FOR_OUT template struct V_vec_acum_fp32_ {}; // template <> struct V_vec_acum_fp32_ { using Type = float; }; @@ -318,6 +348,15 @@ inline __device__ uint32_t mul(uint32_t a, float b) { return res; } +template <> +inline __device__ float2 mul(uint32_t a, float b) { + float2 tmp = half2_to_float2(a); + float2 res; + res.x = tmp.x * b; + res.y = tmp.y * b; + return res; +} + template <> inline __device__ uint2 mul(uint2 a, float b) { uint2 res; @@ -344,6 +383,15 @@ inline __device__ float2 mul(float2 a, float b) { return res; } +template <> +inline __device__ float2 mul(float2 a, uint32_t b) { + float2 tmp_b = half2_to_float2(b); + float2 res; + res.x = a.x * tmp_b.x; + res.y = a.y * tmp_b.y; + return res; +} + template <> inline __device__ float4 mul(float4 a, float b) { float4 res; @@ -403,6 +451,12 @@ inline __device__ float2 fma(float2 a, float2 b, float2 c) { return d; } +inline __device__ float2 fma(float2 a, uint32_t b, float2 c) { + float2 tmp_b = half2_to_float2(b); + float2 d = fma(a, tmp_b, c); + return d; +} + inline __device__ float4 fma(float4 a, float4 b, float4 c) { float4 d; d.x = fma(a.x, b.x, c.x); @@ -524,6 +578,49 @@ inline __device__ float qk_dot_(const K_vec (&q)[N], return qk; } +inline __device__ float4 hmma_fp32_tensorcore(const uint2 &a, uint32_t b) { + float4 c; + float zero = 0.f; + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 \n" + " {%0, %1, %2, %3}, \n" + " {%4, %5}, \n" + " {%6}, \n" + " {%7, %7, %7, %7}; \n" + + : "=f"(c.x), "=f"(c.y), "=f"(c.z), "=f"(c.w) + : "r"(a.x) "r"(a.y), "r"(b), "f"(zero)); + return c; +} + +template +inline __device__ float qk_hmma_dot_(const uint32_t (&q)[N], + const uint32_t (&k)[N], + float inv_sqrt_dh) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + using K_vec_acum = typename K_vec_acum_fp32_::Type; +#else + using K_vec_acum = uint32_t; +#endif + K_vec_acum inv_q = mul(q[0], inv_sqrt_dh); + K_vec_acum qk_vec = mul(inv_q, k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + inv_q = mul(q[ii], inv_sqrt_dh); + qk_vec = fma(inv_q, k[ii], qk_vec); + } +#ifdef MMHA_USE_FP32_ACUM_FOR_FMA + uint32_t qk_vec_ = float2_to_half2(qk_vec); + return hmma_fp32_tensorcore(make_uint2(qk_vec_, 0u), 0x3c003c00u).x; +#else + return hmma_fp32_tensorcore(make_uint2(qk_vec, 0u), 0x3c003c00u).x; +#endif +#else + return 0.f; +#endif +} + template struct Qk_dot { template @@ -534,6 +631,20 @@ struct Qk_dot { } }; +template <> +struct Qk_dot { + template + static inline __device__ float dot(const uint32_t (&q)[N], + const uint32_t (&k)[N], + float inv_sqrt_dh) { +#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && __CUDA_ARCH__ >= 750 + return qk_hmma_dot_(q, k, 
inv_sqrt_dh); +#else + return qk_dot_<4>(q, k, inv_sqrt_dh); +#endif + } +}; + template inline __device__ float block_sum(float *red_smem, float sum) { int warp = threadIdx.x / WARP_SIZE; @@ -606,6 +717,8 @@ template params) { #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + typedef PDDataTypeTraits traits_; + typedef typename traits_::DataType DataType_; static_assert(Dh_MAX % THREADS_PER_KEY == 0, ""); static_assert(Dh_MAX % THREADS_PER_VALUE == 0, ""); @@ -863,7 +976,7 @@ __global__ void masked_multihead_attention_kernel( float logit = logits_smem[ti]; out = fma(logit, cast_to_float(v), out); #else - T logit = logits_smem[ti]; + DataType_ logit = static_cast(logits_smem[ti]); // Update the partial sums. out = fma(logit, v, out); #endif @@ -987,7 +1100,11 @@ void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, if (params.timestep < 32) { MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream); } else if (params.timestep < 2048) { +#if defined(MMHA_USE_HMMA_FOR_REDUCTION) && __CUDA_ARCH__ >= 750 + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 256, stream); +#else MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream); +#endif } else { MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, stream); } From 87388d59677cc94a8d4c528394b9212eb9e6448a Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Mon, 21 Nov 2022 16:45:20 +0800 Subject: [PATCH 125/210] remove lrn which is not used in paddle 2.0 (#47945) --- python/paddle/fluid/layers/nn.py | 98 ------------------- .../fluid/tests/unittests/test_layers.py | 7 -- .../fluid/tests/unittests/test_lrn_op.py | 50 ---------- 3 files changed, 155 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d9253f50a1bea..c10dbcd7d1c19 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -104,7 +104,6 @@ 'unsqueeze', 'lod_reset', 'lod_append', - 'lrn', 'pad', 'label_smooth', 'roi_pool', @@ -6831,103 +6830,6 @@ def lod_append(x, level): return out -def lrn( - input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None, data_format='NCHW' -): - r""" - :alias_main: paddle.nn.functional.lrn - :alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn - :old_api: paddle.fluid.layers.lrn - - This operator implements the Local Response Normalization Layer. - This layer performs a type of "lateral inhibition" by normalizing over local input regions. - For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ - - The formula is as follows: - - .. math:: - - Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + n/2)}_{j = \\max(0, i - n/2)}(Input(j, x, y))^2\\right)^{\\beta} - - In the above equation: - - - :math:`n` : The number of channels to sum over. - - :math:`k` : The offset (avoid being divided by 0). - - :math:`\\alpha` : The scaling parameter. - - :math:`\\beta` : The exponent parameter. - - - Args: - input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W] or [N, H, W, C], - where N is the batch size, C is the input channel, H is Height, W is weight. The data - type is float32. The rank of this tensor must be 4, otherwise it will raise ValueError. - n (int, optional): The number of channels to sum over. Default: 5 - k (float, optional): An offset, positive. Default: 1.0 - alpha (float, optional): The scaling parameter, positive. Default:1e-4 - beta (float, optional): The exponent, positive. 
Default:0.75 - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - - Returns: - Variable: A tensor variable storing the transformation result with the same shape and data type as input. - - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.data( - name="data", shape=[None, 3, 112, 112], dtype="float32") - lrn = fluid.layers.lrn(input=data) - print(lrn.shape) # [-1, 3, 112, 112] - print(lrn.dtype) # float32 - """ - helper = LayerHelper('lrn', **locals()) - check_variable_and_dtype(input, 'input', ['float32'], 'lrn') - dtype = helper.input_dtype() - input_shape = input.shape - dims = len(input_shape) - - if dims != 4: - raise ValueError( - "Input's dimension size of Op(lrn) must be 4, but received %d." - % (dims) - ) - if data_format not in ['NCHW', 'NHWC']: - raise ValueError( - "Attr(data_format) of Op(lrn) got wrong value: received " - + data_format - + " but only NCHW or NHWC supported." - ) - - mid_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - lrn_out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="lrn", - inputs={"X": input}, - outputs={ - "Out": lrn_out, - "MidOut": mid_out, - }, - attrs={ - "n": n, - "k": k, - "alpha": alpha, - "beta": beta, - "data_format": data_format, - }, - ) - - return lrn_out - - def pad(x, paddings, pad_value=0.0, name=None): r""" :alias_main: paddle.nn.functional.pad diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 621e8090d2a14..1f2f07a067e63 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3339,13 +3339,6 @@ def make_space_to_depth(self): ) return layers.space_to_depth(data, 3) - def make_lrn(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - data = self._get_data(name='data', shape=[6, 2, 2], dtype='float32') - return layers.lrn(data) - def make_get_places(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 6044628460000..521889f53eca7 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -106,56 +106,6 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestLRNAPI(unittest.TestCase): - def test_case(self): - data1 = fluid.data(name='data1', shape=[2, 4, 5, 5], dtype='float32') - data2 = fluid.data(name='data2', shape=[2, 5, 5, 4], dtype='float32') - out1 = fluid.layers.lrn(data1, data_format='NCHW') - out2 = fluid.layers.lrn(data2, data_format='NHWC') - data1_np = np.random.random((2, 4, 5, 5)).astype("float32") - data2_np = np.transpose(data1_np, [0, 2, 3, 1]) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - results = exe.run( - 
fluid.default_main_program(), - feed={"data1": data1_np, "data2": data2_np}, - fetch_list=[out1, out2], - return_numpy=True, - ) - - np.testing.assert_allclose( - results[0], np.transpose(results[1], (0, 3, 1, 2)), rtol=1e-05 - ) - - def test_exception(self): - input1 = fluid.data(name="input1", shape=[2, 4, 5, 5], dtype="float32") - input2 = fluid.data( - name="input2", shape=[2, 4, 5, 5, 5], dtype="float32" - ) - - def _attr_data_fromat(): - out = fluid.layers.lrn(input1, data_format='NDHW') - - def _input_dim_size(): - out = fluid.layers.lrn(input2) - - self.assertRaises(ValueError, _attr_data_fromat) - self.assertRaises(ValueError, _input_dim_size) - - -class TestLRNOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # the input must be float32 - in_w = fluid.data(name="in_w", shape=[None, 3, 3, 3], dtype="int64") - self.assertRaises(TypeError, fluid.layers.lrn, in_w) - - class TestLocalResponseNormFAPI(unittest.TestCase): def setUp(self): np.random.seed(123) From 02dfd18d906279c3540056a5d4f7bb6b84d94d88 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Mon, 21 Nov 2022 16:48:54 +0800 Subject: [PATCH 126/210] replace scatter_nd and scatter_nd_add with paddle.scatter_nd and (#47960) paddle.scatter_nd_add --- python/paddle/fluid/layers/nn.py | 134 ------------------ .../tests/unittests/test_scatter_nd_op.py | 16 +-- 2 files changed, 7 insertions(+), 143 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c10dbcd7d1c19..b32124e6bc471 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -117,8 +117,6 @@ 'resize_nearest', 'gather_nd', 'scatter', - 'scatter_nd_add', - 'scatter_nd', 'random_crop', 'mean_iou', 'relu', @@ -8695,138 +8693,6 @@ def scatter(input, index, updates, name=None, overwrite=True): return out -def scatter_nd_add(ref, index, updates, name=None): - r""" - **Scatter_nd_add Layer** - - Output is obtained by applying sparse addition to a single value - or slice in a Variable. - - :attr:`ref` is a Tensor with rank :math:`R` - and :attr:`index` is a Tensor with rank :math:`K` . Thus, :attr:`index` - has shape :math:`[i_0, i_1, ..., i_{K-2}, Q]` where :math:`Q \leq R` . :attr:`updates` - is a Tensor with rank :math:`K - 1 + R - Q` and its - shape is :math:`index.shape[:-1] + ref.shape[index.shape[-1]:]` . - - According to the :math:`[i_0, i_1, ..., i_{K-2}]` of :attr:`index` , - add the corresponding :attr:`updates` slice to the :attr:`ref` slice - which is obtained by the last one dimension of :attr:`index` . - - .. code-block:: text - - Given: - - * Case 1: - ref = [0, 1, 2, 3, 4, 5] - index = [[1], [2], [3], [1]] - updates = [9, 10, 11, 12] - - we get: - - output = [0, 22, 12, 14, 4, 5] - - * Case 2: - ref = [[65, 17], [-14, -25]] - index = [[], []] - updates = [[[-1, -2], [1, 2]], - [[3, 4], [-3, -4]]] - ref.shape = (2, 2) - index.shape = (2, 0) - updates.shape = (2, 2, 2) - - we get: - - output = [[67, 19], [-16, -27]] - - Args: - ref (Variable): The ref input. Its dtype should be int32, int64, float32, float64. - index (Variable): The index input with rank > 1 and index.shape[-1] <= ref.rank. - Its dtype should be int32 or int64 as it is used as indexes. - updates (Variable): The updated value of scatter_nd_add op, and it must have the same dtype - as ref. It must have the shape index.shape[:-1] + ref.shape[index.shape[-1]:]. - name (str|None): The output variable name. If set None, the layer will be named automatically. 
- - Returns: - output (Variable): The output is a tensor with the same shape and dtype as ref. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - ref = fluid.data(name='ref', shape=[3, 5, 9, 10], dtype='float32') - index = fluid.data(name='index', shape=[3, 2], dtype='int32') - updates = fluid.data(name='update', shape=[3, 9, 10], dtype='float32') - - output = fluid.layers.scatter_nd_add(ref, index, updates) - """ - - if in_dygraph_mode(): - return _C_ops.scatter_nd_add(ref, index, updates) - else: - if _in_legacy_dygraph(): - op = getattr(_legacy_C_ops, 'scatter_nd_add') - return op(ref, index, updates) - else: - if ref.dtype != updates.dtype: - raise ValueError("ref and updates must have same data type.") - - helper = LayerHelper('scatter_nd_add', **locals()) - dtype = helper.input_dtype(input_param_name='ref') - output = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="scatter_nd_add", - inputs={"X": ref, "Index": index, "Updates": updates}, - outputs={"Out": output}, - ) - return output - - -def scatter_nd(index, updates, shape, name=None): - """ - **Scatter_nd Layer** - - Output is obtained by scattering the :attr:`updates` in a new tensor according - to :attr:`index` . This op is similar to :code:`scatter_nd_add`, except the - tensor of :attr:`shape` is zero-initialized. Correspondingly, :code:`scatter_nd(index, updates, shape)` - is equal to :code:`scatter_nd_add(paddle.zeros(shape, updates.dtype), index, updates)` . - If :attr:`index` has repeated elements, then the corresponding updates are accumulated. - Because of the numerical approximation issues, the different order of repeated elements - in :attr:`index` may cause different results. The specific calculation method can be - seen :code:`scatter_nd_add` . This op is the inverse of the :code:`gather_nd` op. - - Args: - index (Tensor): The index input with ndim > 1 and index.shape[-1] <= len(shape). - Its dtype should be int32 or int64 as it is used as indexes. - updates (Tensor): The updated value of scatter_nd op. Its dtype should be float32, float64. - It must have the shape index.shape[:-1] + shape[index.shape[-1]:] - shape(tuple|list): Shape of output tensor. - name (str|None): The output Tensor name. If set None, the layer will be named automatically. - - Returns: - output (Tensor): The output is a tensor with the same type as :attr:`updates` . - - Examples: - - .. 
code-block:: python - - import paddle - import numpy as np - - index_data = np.array([[1, 1], - [0, 1], - [1, 3]]).astype(np.int64) - index = paddle.to_tensor(index_data) - updates = paddle.rand(shape=[3, 9, 10], dtype='float32') - shape = [3, 5, 9, 10] - - output = paddle.scatter_nd(index, updates, shape) - """ - return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name) - - @templatedoc() def random_crop(x, shape, seed=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py index e9edd089f5d3c..5fef3d6d3f9f4 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py @@ -183,7 +183,7 @@ def testcase1(self): dtype='float32', append_batch_size=False, ) - output1 = fluid.layers.scatter_nd_add(ref1, index1, updates1) + output1 = paddle.scatter_nd_add(ref1, index1, updates1) def testcase2(self): ref2 = fluid.layers.data( @@ -204,7 +204,7 @@ def testcase2(self): dtype='double', append_batch_size=False, ) - output2 = fluid.layers.scatter_nd_add( + output2 = paddle.scatter_nd_add( ref2, index2, updates2, name="scatter_nd_add" ) @@ -222,7 +222,7 @@ def testcase3(self): dtype='float32', append_batch_size=False, ) - output3 = fluid.layers.scatter_nd(index3, updates3, shape3) + output3 = paddle.scatter_nd(index3, updates3, shape3) def testcase4(self): shape4 = [10, 9, 8, 1, 3] @@ -238,9 +238,7 @@ def testcase4(self): dtype='double', append_batch_size=False, ) - output4 = fluid.layers.scatter_nd( - index4, updates4, shape4, name='scatter_nd' - ) + output4 = paddle.scatter_nd(index4, updates4, shape4, name='scatter_nd') def testcase5(self): if not fluid.core.is_compiled_with_cuda(): @@ -307,7 +305,7 @@ def check_raise_is_test(): updates5 = fluid.layers.data( name='updates5', shape=[2, 10], dtype='float32' ) - output5 = fluid.layers.scatter_nd_add(ref5, index5, updates5) + output5 = paddle.scatter_nd_add(ref5, index5, updates5) except Exception as e: t = "The last dimension of Input(Index)'s shape should be no greater " if t in str(e): @@ -335,7 +333,7 @@ def test_check_raise2(self): dtype='float32', append_batch_size=False, ) - output6 = fluid.layers.scatter_nd_add(ref6, index6, updates6) + output6 = paddle.scatter_nd_add(ref6, index6, updates6) def test_check_raise3(self): def check_raise_is_test(): @@ -347,7 +345,7 @@ def check_raise_is_test(): updates7 = fluid.layers.data( name='updates7', shape=[2, 4, 5, 20], dtype='float32' ) - output7 = fluid.layers.scatter_nd(index7, updates7, shape) + output7 = paddle.scatter_nd(index7, updates7, shape) except Exception as e: t = "Updates has wrong shape" if t in str(e): From 55f6fb3d1f9d08ceb6026964b9f38a4102bc79ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 21 Nov 2022 10:08:32 +0100 Subject: [PATCH 127/210] [PHI] Migrate mul_grad kernel (#48061) * cleanup unused code * unify is_int8 is_bfloat16 * Simplify matmul_v2 FWD kernel * remove RunKernel methods * remove import namespace * remove headers * clean fluid/phi cross imports * remove fluid axpy_handler * delete fluid methods * activations * OneDNNMemDesc * MKLDNNFormatForSize * MatchShapeToLayout * MKLDNNMemoryFormat * MKLDNNFormat * ReorderMKLDNNHandler * to_void_cast * review suggestions * interpolate * remove fluid depedency * init * ExecuteMatMulV2 * rm fluid kernel * matmul_grad * remove mutable_data * mul_grad --- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 83 ------------------- 
paddle/phi/backends/onednn/onednn_reuse.h | 41 +++++++++ .../phi/kernels/onednn/matmul_grad_kernel.cc | 50 +++++++++++ 3 files changed, 91 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index f667c9809df04..86395b0465d03 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -489,83 +489,6 @@ class MulMKLDNNKernel : public framework::OpKernel { } }; -template -class MulGradMKLDNNKernel : public MulMKLDNNKernel { - public: - void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } - - private: - void RunKernel(const ExecutionContext &ctx) const { - const auto &dev_ctx = ctx.template device_context(); - const auto &onednn_engine = dev_ctx.GetEngine(); - - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Y"); - const auto *dout = - ctx.Input(framework::GradVarName("Out")); - - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *dy = ctx.Output(framework::GradVarName("Y")); - - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - - const Tensor x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : static_cast(*x); - const Tensor y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : static_cast(*y); - - Tensor dout_matrix = *dout; - dout_matrix.Resize({phi::flatten_to_2d(x->dims(), x_num_col_dims)[0], - phi::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); - - // adding mb dim because MatMulV2 handler needs it - std::vector x_dims(3, 1); - std::vector y_dims(3, 1); - std::vector dout_dims(3, 1); - - x_dims[1] = x_matrix.dims()[0]; - x_dims[2] = x_matrix.dims()[1]; - - y_dims[1] = y_matrix.dims()[0]; - y_dims[2] = y_matrix.dims()[1]; - - dout_dims[1] = dout_matrix.dims()[0]; - dout_dims[2] = dout_matrix.dims()[1]; - - if (dx != nullptr) { - dx->set_lod(x->lod()); - this->ExecuteMatMul(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - &dout_matrix, - dout_dims, - false, - &y_matrix, - y_dims, - true, - static_cast(dx)); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - this->ExecuteMatMul(ctx, - dev_ctx, - onednn_engine, - ctx.GetPlace(), - &x_matrix, - x_dims, - true, - &dout_matrix, - dout_dims, - false, - static_cast(dy)); - } - } -}; - } // namespace operators } // namespace paddle @@ -578,9 +501,3 @@ REGISTER_OP_KERNEL(mul, ops::MulMKLDNNINT8Kernel, ops::MulMKLDNNKernel, ops::MulMKLDNNKernel); - -REGISTER_OP_KERNEL(mul_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::MulGradMKLDNNKernel, - ops::MulGradMKLDNNKernel); diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index bd3d3f30f7a44..bc88fef443df2 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -1912,6 +1912,47 @@ class MatmulOneDNNHandler } }; +template +static void ExecuteMul(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const std::vector& x_dims, + const std::vector& y_dims, + bool trans_x, + bool trans_y, + DenseTensor* out) { + static const std::vector vec_placeholder; + MatmulOneDNNHandler handler(dev_ctx, + x_dims, + y_dims, + trans_x, + trans_y, + vec_placeholder, + vec_placeholder, + false); + + const auto src_memory_p = handler.AcquireSrcMemory(&x); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y); + const auto dst_memory_p = handler.AcquireDstMemory(dev_ctx, 
out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = OneDNNContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + // This kernel is flattening dims so then we need to unflattened version + // that should be set in out reshape require plain layout, but + // MatmulV2MKLDNNHanlder enforces one so it should work + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); +} + template void ExecuteMatmul(const OneDNNContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc index 47807f156b18f..ceb752f6d41be 100644 --- a/paddle/phi/kernels/onednn/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/matmul_grad_kernel.cc @@ -153,6 +153,49 @@ void MatmulGradKernel(const Context &dev_ctx, dy->Resize(y.dims()); } +template +void MatmulWithFlattenGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out_grad, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor *x_grad, + DenseTensor *y_grad) { + const DenseTensor reshaped_y = + paddle::framework::ReshapeToMatrix(y, y_num_col_dims); + const DenseTensor reshaped_x = + paddle::framework::ReshapeToMatrix(x, x_num_col_dims); + const DenseTensor x_matrix = x.dims().size() > 2 ? reshaped_x : x; + const DenseTensor y_matrix = y.dims().size() > 2 ? reshaped_y : y; + + DenseTensor dout_matrix = out_grad; + dout_matrix.Resize({flatten_to_2d(x.dims(), x_num_col_dims)[0], + flatten_to_2d(y.dims(), y_num_col_dims)[1]}); + + // adding mb dim because MatMulV2 handler needs it + std::vector x_dims(3, 1); + std::vector y_dims(3, 1); + std::vector dout_dims(3, 1); + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + dout_dims[1] = dout_matrix.dims()[0]; + dout_dims[2] = dout_matrix.dims()[1]; + + if (x_grad != nullptr) { + x_grad->set_lod(x.lod()); + funcs::ExecuteMul( + dev_ctx, dout_matrix, y_matrix, dout_dims, y_dims, false, true, x_grad); + } + if (y_grad != nullptr) { + y_grad->set_lod(y.lod()); + funcs::ExecuteMul( + dev_ctx, x_matrix, dout_matrix, x_dims, dout_dims, true, false, y_grad); + } +} + } // namespace phi PD_REGISTER_KERNEL(matmul_grad, @@ -161,3 +204,10 @@ PD_REGISTER_KERNEL(matmul_grad, phi::MatmulGradKernel, float, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(matmul_with_flatten_grad, + OneDNN, + ONEDNN, + phi::MatmulWithFlattenGradKernel, + float, + phi::dtype::bfloat16) {} From 41483383d4955c5f478ca2239a21681e9d1ce548 Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Mon, 21 Nov 2022 17:28:35 +0800 Subject: [PATCH 128/210] delete unnecessary shape and slice op (#48112) --- .../ir/fuse_multi_transformer_layer_pass.cc | 45 +--- .../fused_multi_transformer_decoder_pass.cc | 216 +++++++++--------- .../inference/api/paddle_pass_builder.cc | 1 + 3 files changed, 113 insertions(+), 149 deletions(-) diff --git a/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc index 4e2bca2ae2a97..b730d46ab7c5f 100644 --- a/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc +++ b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc @@ -62,34 +62,7 @@ 
MultiTransformerLayerPattern::operator()(bool enable_int8, fused_multi_transformer_name, "Out"); if (is_decoder) { - auto shape_repr = - PDNodeName(name_scope_, repr_, id_, "shape_" + std::to_string(i)); - node_reprs["shape_" + std::to_string(i)] = shape_repr; - auto* shape = pattern->NewNode(shape_repr)->assert_is_op("shape"); - - auto shape_out_repr = - PDNodeName(name_scope_, repr_, id_, "shape_out_" + std::to_string(i)); - node_reprs["shape_out_" + std::to_string(i)] = shape_out_repr; - auto* shape_out = - pattern->NewNode(shape_out_repr)->assert_is_op_output("shape", "Out"); - - shape->LinksFrom({src_mask}).LinksTo({shape_out}); - - auto slice_repr = - PDNodeName(name_scope_, repr_, id_, "slice_" + std::to_string(i)); - node_reprs["slice_" + std::to_string(i)] = slice_repr; - auto* slice = pattern->NewNode(slice_repr)->assert_is_op("slice"); - - auto slice_out_repr = - PDNodeName(name_scope_, repr_, id_, "slice_out_" + std::to_string(i)); - node_reprs["slice_out_" + std::to_string(i)] = slice_out_repr; - auto* slice_out = - pattern->NewNode(slice_out_repr)->assert_is_op_output("slice", "Out"); - - slice->LinksFrom({shape_out}).LinksTo({slice_out}); - - fused_multi_transformer->LinksFrom({x0, src_mask, slice_out}) - .LinksTo({out}); + fused_multi_transformer->LinksFrom({x0, src_mask}).LinksTo({out}); } else { auto cache_kv_repr = PDNodeName(name_scope_, repr_, id_, "cache_kv_" + std::to_string(i)); @@ -187,10 +160,6 @@ int FuseMultiTransformerLayerPass::BuildFusion(Graph* graph, std::vector fuse_op_nodes; std::vector out_nodes; - std::vector unused_node_prefixes = { - "shape_", "shape_out_", "slice_", "slice_out_"}; - std::vector unused_nodes; - std::vector fuse_op_descs; std::vector fuse_op_input_var_name_maps; std::vector fuse_op_output_var_name_maps; @@ -219,14 +188,6 @@ int FuseMultiTransformerLayerPass::BuildFusion(Graph* graph, fill_op_node->Op()->SetInput("Input", {x0->Name()}); IR_NODE_UNLINK(out_nodes[i - 1], fill_op_node); IR_NODE_LINK_TO(x0, fill_op_node); - } else if (is_decoder && i != 0) { - for (const auto& unused_node_prefix : unused_node_prefixes) { - PDNode* unused_pdnode = - multi_layer_pattern.PatternBase::pattern->RetrieveNode( - node_reprs[unused_node_prefix + std::to_string(i)]); - Node* unused_node = subgraph.at(unused_pdnode); - unused_nodes.push_back(unused_node); - } } } @@ -293,10 +254,6 @@ int FuseMultiTransformerLayerPass::BuildFusion(Graph* graph, std::unordered_set marked_fuse_op_nodes( fuse_op_nodes.begin() + 1, fuse_op_nodes.end()); - if (is_decoder) { - marked_fuse_op_nodes.insert(unused_nodes.begin(), unused_nodes.end()); - } - GraphSafeRemoveNodes(graph, marked_fuse_op_nodes); ++fusion_count; }; diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc index 42c699195beb9..2d93758f177d2 100644 --- a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -1146,35 +1146,7 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); - VarDesc shape_out_desc("shape_out." 
+ std::to_string(layer_idx)); - shape_out_desc.SetDataType(proto::VarType::INT32); - shape_out_desc.SetPersistable(false); - auto* shape_out = graph->CreateVarNode(&shape_out_desc); - - OpDesc shape_op_desc(layer_norm->Op()->Block()); - shape_op_desc.SetType("shape"); - shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); - shape_op_desc.SetOutput("Out", {shape_out->Name()}); - auto* shape_op = graph->CreateOpNode(&shape_op_desc); - - VarDesc slice_out_desc("slice_out." + std::to_string(layer_idx)); - slice_out_desc.SetDataType(proto::VarType::INT32); - slice_out_desc.SetPersistable(false); - auto* slice_out = graph->CreateVarNode(&slice_out_desc); - - OpDesc slice_op_desc(layer_norm->Op()->Block()); - slice_op_desc.SetType("slice"); - slice_op_desc.SetInput("Input", {shape_out->Name()}); - slice_op_desc.SetOutput("Out", {slice_out->Name()}); - std::vector axes = {0}; - std::vector starts = {3}; - std::vector ends = {4}; - slice_op_desc.SetAttr("axes", axes); - slice_op_desc.SetAttr("starts", starts); - slice_op_desc.SetAttr("ends", ends); - auto* slice_op = graph->CreateOpNode(&slice_op_desc); - - fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + fused_multi_transformer_op_desc.SetInput("TimeStep", {"slice_out.0"}); // Out Linear input fused_multi_transformer_op_desc.SetInput("OutLinearW", @@ -1219,12 +1191,42 @@ int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); - // TimeStep link - IR_NODE_LINK_TO(eltadd_qk_b, shape_op); - IR_NODE_LINK_TO(shape_op, shape_out); - IR_NODE_LINK_TO(shape_out, slice_op); - IR_NODE_LINK_TO(slice_op, slice_out); - IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + if (layer_idx == 0) { + VarDesc shape_out_desc("shape_out.0"); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out.0"); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + } IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); @@ -1789,35 +1791,7 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); - VarDesc shape_out_desc("shape_out." 
+ std::to_string(layer_idx)); - shape_out_desc.SetDataType(proto::VarType::INT32); - shape_out_desc.SetPersistable(false); - auto* shape_out = graph->CreateVarNode(&shape_out_desc); - - OpDesc shape_op_desc(layer_norm->Op()->Block()); - shape_op_desc.SetType("shape"); - shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); - shape_op_desc.SetOutput("Out", {shape_out->Name()}); - auto* shape_op = graph->CreateOpNode(&shape_op_desc); - - VarDesc slice_out_desc("slice_out." + std::to_string(layer_idx)); - slice_out_desc.SetDataType(proto::VarType::INT32); - slice_out_desc.SetPersistable(false); - auto* slice_out = graph->CreateVarNode(&slice_out_desc); - - OpDesc slice_op_desc(layer_norm->Op()->Block()); - slice_op_desc.SetType("slice"); - slice_op_desc.SetInput("Input", {shape_out->Name()}); - slice_op_desc.SetOutput("Out", {slice_out->Name()}); - std::vector axes = {0}; - std::vector starts = {3}; - std::vector ends = {4}; - slice_op_desc.SetAttr("axes", axes); - slice_op_desc.SetAttr("starts", starts); - slice_op_desc.SetAttr("ends", ends); - auto* slice_op = graph->CreateOpNode(&slice_op_desc); - - fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + fused_multi_transformer_op_desc.SetInput("TimeStep", {"slice_out.0"}); // Out Linear input fused_multi_transformer_op_desc.SetInput("OutLinearW", @@ -1862,12 +1836,42 @@ int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); - // TimeStep link - IR_NODE_LINK_TO(eltadd_qk_b, shape_op); - IR_NODE_LINK_TO(shape_op, shape_out); - IR_NODE_LINK_TO(shape_out, slice_op); - IR_NODE_LINK_TO(slice_op, slice_out); - IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + if (layer_idx == 0) { + VarDesc shape_out_desc("shape_out.0"); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out.0"); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + } IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); @@ -2405,35 +2409,7 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); - VarDesc shape_out_desc("shape_out." 
+ std::to_string(layer_idx)); - shape_out_desc.SetDataType(proto::VarType::INT32); - shape_out_desc.SetPersistable(false); - auto* shape_out = graph->CreateVarNode(&shape_out_desc); - - OpDesc shape_op_desc(layer_norm->Op()->Block()); - shape_op_desc.SetType("shape"); - shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); - shape_op_desc.SetOutput("Out", {shape_out->Name()}); - auto* shape_op = graph->CreateOpNode(&shape_op_desc); - - VarDesc slice_out_desc("slice_out." + std::to_string(layer_idx)); - slice_out_desc.SetDataType(proto::VarType::INT32); - slice_out_desc.SetPersistable(false); - auto* slice_out = graph->CreateVarNode(&slice_out_desc); - - OpDesc slice_op_desc(layer_norm->Op()->Block()); - slice_op_desc.SetType("slice"); - slice_op_desc.SetInput("Input", {shape_out->Name()}); - slice_op_desc.SetOutput("Out", {slice_out->Name()}); - std::vector axes = {0}; - std::vector starts = {3}; - std::vector ends = {4}; - slice_op_desc.SetAttr("axes", axes); - slice_op_desc.SetAttr("starts", starts); - slice_op_desc.SetAttr("ends", ends); - auto* slice_op = graph->CreateOpNode(&slice_op_desc); - - fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + fused_multi_transformer_op_desc.SetInput("TimeStep", {"slice_out.0"}); // Out Linear input fused_multi_transformer_op_desc.SetInput("OutLinearW", @@ -2483,12 +2459,42 @@ int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); - // TimeStep link - IR_NODE_LINK_TO(eltadd_qk_b, shape_op); - IR_NODE_LINK_TO(shape_op, shape_out); - IR_NODE_LINK_TO(shape_out, slice_op); - IR_NODE_LINK_TO(slice_op, slice_out); - IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + if (layer_idx == 0) { + VarDesc shape_out_desc("shape_out.0"); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out.0"); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + } IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index a1980a8ba5005..19fd7279b9677 100755 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -177,6 +177,7 @@ 
const std::vector kGpuLowerPrecisionPasses{ "fused_multi_transformer_decoder_fuse_qkv_pass", "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass", "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass", + "fuse_multi_transformer_layer_pass", "gpu_cpu_map_matmul_v2_to_mul_pass", "gpu_cpu_map_matmul_v2_to_matmul_pass", "fc_fuse_pass", From fed0ed34e9ad221289e24d80f03834cbcd6ec16e Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Mon, 21 Nov 2022 12:08:53 +0100 Subject: [PATCH 129/210] add fc-residual quantization (#46917) * add fc-residual quantization * revert removal of check for use_mkldnn * fix bug * add disable_logs * review fix call twice AreScalesPresntForNodes instead of if-else * rewrite residual input to output * revert fc mkldnn taking residual data * format fix * fix LoDTensor->DenseTensor * LoDTensor->DenseTensor * output->input * revert changes to unsupported script revert changes to unsupported script * remove fc residualdata from output blocklist in cpu_bfloat16_pass.cc --- .../framework/ir/graph_pattern_detector.cc | 15 +-- .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 1 - .../framework/ir/mkldnn/cpu_quantize_pass.cc | 35 +++++- .../framework/ir/mkldnn/cpu_quantize_pass.h | 4 +- .../ir/mkldnn/cpu_quantize_squash_pass.cc | 7 +- .../fc_elementwise_add_mkldnn_fuse_pass.cc | 2 +- .../framework/ir/mkldnn/fc_mkldnn_pass.cc | 35 ++---- .../framework/ir/mkldnn/fc_mkldnn_pass.h | 1 - .../inference/api/paddle_pass_builder.cc | 1 + paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 113 ++++++++++-------- 10 files changed, 117 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index cb131f8ec16ac..7f509d64b5c23 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1163,21 +1163,12 @@ PDNode *patterns::FCMKLDNN::operator()(bool with_residual_data) { if (with_residual_data) { auto res_fc_var = pattern->NewNode(residual_data_repr()) ->AsInput() - ->assert_is_op_input("fc") - // assert_is_op_input with two arguments doesn't work - // because ResidualData in FC is set as output with - // SetOutput so we do custom assert output - ->assert_more([&](Node *x) { - for (auto *op : x->outputs) - if (IsNthOutput(x, op, "ResidualData", 0)) - return true; - return false; - }); + ->assert_is_op_input("fc", "ResidualData"); links_from.push_back(res_fc_var); } else { fc_op->assert_more([&](Node *x) { - if (!HasOutput(x, "ResidualData") || - x->Op()->Output("ResidualData").size() == 0) + if (!HasInput(x, "ResidualData") || + x->Op()->Input("ResidualData").size() == 0) return true; return false; }); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index d64fbe16a3eb4..ba8bacd200b12 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -200,7 +200,6 @@ class DeQuantizer final : public Quanter { std::unordered_map> block_list{ {"layer_norm", {"Mean", "Variance"}}, // not used in inference in MKLDNN - {"fc", {"ResidualData"}}, // artifical output, already dequantized {"matmul", {"ResidualData"}}, // artifical output, already dequantized {"matmul_v2", {"ResidualData"}}}; // artifical output, already dequantized diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index efdba4f44fdf8..ac509aa604bd6 100644 
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -515,16 +515,17 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, ((with_residual_data) ? "with residual connection" : "")); } -void CPUQuantizePass::QuantizeFc(Graph* graph) const { +void CPUQuantizePass::QuantizeFc(Graph* graph, bool with_residual_data) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::FCMKLDNN fc_pattern{pattern, name_scope_}; - fc_pattern(false /* with_residual */); + fc_pattern(with_residual_data); int quantize_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize fc op"; + VLOG(4) << "Quantize fc op " << (with_residual_data ? "with" : "without") + << " residual data"; GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern); // skip if should not be quantized @@ -532,6 +533,7 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { LogQuantizationDisabled(fc); return; } + if (!fc->Op()->GetAttrIfExists("use_mkldnn")) { MarkAndLogCannotQuantizeOp(fc, "use_mkldnn attribute set to false"); return; @@ -546,6 +548,26 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { return; } + if (with_residual_data) { + GET_IR_NODE_FROM_SUBGRAPH(residual_data, residual_data, fc_pattern); + if (!AreScalesPresentForNodes({residual_data})) { + MarkAndLogCannotQuantizeOp(fc, "No scale available for the operator"); + return; + } + + bool is_residual_unsigned{false}; + auto residual_scale = + GetScaleValueForNode(residual_data, &is_residual_unsigned); + + QuantizeInput(g, + fc, + residual_data, + "ResidualData", + residual_scale, + is_residual_unsigned, + "Scale_in_eltwise"); + } + bool is_input_unsigned{false}; auto input_scale = GetScaleValueForNode(input, &is_input_unsigned); QuantizeInput( @@ -576,7 +598,9 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const { gpd(graph, handler); AddStatis(quantize_fc_count); - LogQuantizedOpsCounter("fc", quantize_fc_count); + LogQuantizedOpsCounter("fc", + quantize_fc_count, + with_residual_data ? 
"with residual connection" : ""); } void CPUQuantizePass::QuantizePool(Graph* graph) const { @@ -1228,7 +1252,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); - QuantizeFc(graph); + QuantizeFc(graph, false /* with_residual_data */); + QuantizeFc(graph, true /* with_residual_data */); QuantizeMatmul(graph, false /* with_residual_data */); QuantizeMatmul(graph, true /* with_residual_data */); QuantizeImmutable(graph, "reshape2", "X"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 64f9b11ee9464..b3c5312197baf 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,8 +49,8 @@ class CPUQuantizePass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; - void QuantizeConv(Graph* graph, bool with_residual_data = false) const; - void QuantizeFc(Graph* graph) const; + void QuantizeConv(Graph* graph, bool with_residual_data) const; + void QuantizeFc(Graph* graph, bool with_residual_data) const; void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; void QuantizePriorBox(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index e0a64b2036bb7..b0ccbb8aa9d26 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -337,7 +337,8 @@ void CPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { if (dequant_in->outputs.size() == 1) { if (any_op->Op()->Type() == "conv2d" || - any_op->Op()->Type() == "conv2d_transpose") { + any_op->Op()->Type() == "conv2d_transpose" || + any_op->Op()->Type() == "fc") { // do not squash if fuse residual connection is true // because residual fusion does not support force output with fp32 if (any_op->Op()->GetAttrIfExists("fuse_residual_connection")) @@ -418,8 +419,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { last_op_names.begin(), last_op_names.end(), quant_out->Name()), last_op_names.end()); last_op_names.push_back(first_quant_out->Name()); - last_op->Op()->SetInput(last_op_input_name, - std::vector(last_op_names)); + last_op_op->SetInput(last_op_input_name, + std::vector(last_op_names)); IR_NODE_LINK_TO(first_quant_out, last_op); GraphSafeRemoveNodes(graph, {quant_op, quant_out}); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc index 7b0951b9c7ddc..9ddf9e161db7d 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc @@ -119,7 +119,7 @@ GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC( return; } - fc_op->Op()->SetOutput("ResidualData", {residual_data->Name()}); + fc_op->Op()->SetInput("ResidualData", {residual_data->Name()}); fc_op->Op()->SetOutput("Out", {elementwise_out->Name()}); fc_op->Op()->SetAttr("fuse_residual_connection", true); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc index a2f8c14d1a2cc..ceb73b0911267 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc @@ -29,18 +29,16 @@ namespace 
ir { class Graph; -namespace { -void LogEnabledOps(const int counter, const std::string& details) { - std::string msg_ss{"--- enabled FC MKL-DNN for "}; - msg_ss += counter + " fc ops " + details; - string::PrettyLogDetail(msg_ss.c_str()); -} -} // namespace +void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + Init("fc_mkldnn_pass", graph); -void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const { GraphPatternDetector gpd; patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass"); - fc_pattern(with_residual); + // searching for fc+residual doesn't make sense at this stage + fc_pattern(false /*with_residual*/); int found_fc_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -79,19 +77,12 @@ void FCMKLDNNPass::ApplyPass(ir::Graph* graph, bool with_residual) const { AddStatis(found_fc_count); - LogEnabledOps(found_fc_count, - (with_residual ? "with residual connection" - : "without residual connection")); -} - -void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL(graph, - platform::errors::InvalidArgument( - "Pointer to graph argument should not be NULL.")); - Init("fc_mkldnn_pass", graph); - - ApplyPass(graph, true); - ApplyPass(graph, false); + if ((!Has("disable_logs") || !Get("disable_logs")) && + (found_fc_count > 0)) { + std::string msg_ss = "--- enabled FC MKL-DNN for " + + std::to_string(found_fc_count) + " fc ops "; + string::PrettyLogDetail(msg_ss.c_str()); + } } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h index 9367e08e7c703..df02250394a19 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h @@ -34,7 +34,6 @@ class FCMKLDNNPass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const; - void ApplyPass(ir::Graph* graph, bool with_residual) const; }; } // namespace ir diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 19fd7279b9677..062264222b255 100755 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -439,6 +439,7 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("repeated_fc_relu_fuse_pass"); passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); + passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass"); passes_.push_back("batch_norm_act_fuse_pass"); passes_.push_back("softplus_activation_mkldnn_fuse_pass"); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index a9d1e6e9d5810..6a6704c094533 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -103,15 +103,16 @@ class FCMKLDNNHandler dnnl::primitive_attr attributes; dnnl::post_ops post_operations; - std::vector output_shift_scale; - float scale = 1.0f; + float sum_scale = 1.0f; + float activation_scale = 1.0f; if (phi::funcs::is_int8()) { - std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); + std::vector output_shift_scale; + std::tie(output_shift_scale, sum_scale, activation_scale) = + GetOutputScales(ctx); int mask = CreateMask(1, output_shift_scale.size() > 1); 
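// Editor's note (illustrative summary, not part of the patch): in the int8 path the
// three scales resolved by GetOutputScales() are used as follows --
// output_shift_scale[i] is roughly Scale_out / (Scale_in * Scale_weights[i]) and is
// passed to set_output_scales() below (with Scale_out folded into the activation
// post-op instead when an activation is fused), sum_scale = Scale_out /
// Scale_in_eltwise rescales the fused residual input via
// post_operations.append_sum(), and activation_scale carries Scale_out only when an
// activation is fused and the output stays quantized.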
attributes.set_output_scales(mask, output_shift_scale); } - float sum_scale = 1.0f; if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { post_operations.append_sum(sum_scale); @@ -120,9 +121,9 @@ class FCMKLDNNHandler // ReLU from "fc_fuse_pass" if (ctx.Attr("activation_type") == "relu") { post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f); + activation_scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f); } - platform::AppendActivation(ctx, post_operations, scale); + platform::AppendActivation(ctx, post_operations, activation_scale); if (ctx.HasAttr("fused_output_scale")) { float scale_alpha = ctx.Attr("fused_output_scale"); @@ -136,18 +137,22 @@ class FCMKLDNNHandler // Compute the bias scales so that its values correspond to the // scale of data being an output of weights and input multiplication - std::vector ComputeBiasScales( - const float scale_in, const std::vector& scale_weights) { - std::vector bias_scales(scale_weights.size()); - - for (size_t i = 0; i < bias_scales.size(); ++i) { - if (scale_weights[i] == 0.0) - bias_scales[i] = 1.0f; - else - bias_scales[i] = scale_in * scale_weights[i]; + std::vector GetBiasScales(const framework::ExecutionContext& ctx) { + if (ctx.HasAttr("Bias_scales")) { + return ctx.Attr>("Bias_scales"); + } else { + const float scale_in = ctx.Attr("Scale_in"); + const auto& scale_weights = ctx.Attr>("Scale_weights"); + std::vector bias_scales(scale_weights.size()); + + for (size_t i = 0; i < bias_scales.size(); ++i) { + if (scale_weights[i] == 0.0) + bias_scales[i] = 1.0f; + else + bias_scales[i] = scale_in * scale_weights[i]; + } + return bias_scales; } - - return bias_scales; } // Correct output scale, to take into account scaling of input and weights @@ -155,32 +160,44 @@ class FCMKLDNNHandler // scaled with its own scales, this data needs to be divided by // those scales to normalise them back to what their floating-point range // was. Then we multiply them by desired output scale we want on the output. - std::tuple, float> ComputeOutputShiftScale( + std::tuple, float, float> GetOutputScales( const ExecutionContext& ctx) { - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - bool has_activation = !ctx.Attr("activation_type").empty(); - bool force_fp32_output = ctx.Attr("force_fp32_output"); - - // If the output will be in floats, we don't multiply by scale_out. - - float scale = (!force_fp32_output && has_activation) - ? ctx.Attr("Scale_out") - : 1.0f; - float inner_scale = (force_fp32_output || has_activation) - ? 1.0f - : ctx.Attr("Scale_out"); - const size_t weight_scales_num = scale_weights_data.size(); - - for (size_t i = 0; i < weight_scales_num; ++i) { - if (scale_weights_data[i] == 0.0) - scale_weights_data[i] = inner_scale; - else - scale_weights_data[i] = - inner_scale / (scale_in_data * scale_weights_data[i]); + if (ctx.HasAttr("Sum_scale")) { + return std::make_tuple(ctx.Attr>("Output_shift_scale"), + ctx.Attr("Sum_scale"), + ctx.Attr("Activation_scale")); + } else { + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool has_activation = !ctx.Attr("activation_type").empty(); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + bool fuse_residual_conn = ctx.HasAttr("fuse_residual_connection") && + ctx.Attr("fuse_residual_connection"); + auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise") + ? 
ctx.Attr("Scale_in_eltwise") + : 1.0f; + + // If the output will be in floats, we don't multiply by scale_out. + + float activation_scale = (!force_fp32_output && has_activation) + ? ctx.Attr("Scale_out") + : 1.0f; + float scale_out_data = (force_fp32_output || has_activation) + ? 1.0f + : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + const size_t weight_scales_num = scale_weights_data.size(); + + for (size_t i = 0; i < weight_scales_num; ++i) { + if (scale_weights_data[i] == 0.0) + scale_weights_data[i] = scale_out_data; + else + scale_weights_data[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + return std::make_tuple(scale_weights_data, sum_scale, activation_scale); } - - return make_tuple(scale_weights_data, scale); } // Computing MKL-DNN's scaling mask which determines along which dimension @@ -240,9 +257,7 @@ class FCMKLDNNHandler } std::shared_ptr AcquireBiasMemoryWithReorder( - const phi::DenseTensor* bias, - const float scale_in, - const std::vector& scale_weights) { + const framework::ExecutionContext& ctx, const phi::DenseTensor* bias) { const float* bias_data = bias->data(); if (phi::funcs::is_int8() == false) { @@ -255,7 +270,7 @@ class FCMKLDNNHandler this->dev_ctx_.GetBlob(bias_key)); if (!memory_p) { - const auto& scale_data = ComputeBiasScales(scale_in, scale_weights); + const auto& scale_data = GetBiasScales(ctx); dnnl::primitive_attr attrs; int mask = CreateMask(0, scale_data.size() > 1); @@ -316,7 +331,7 @@ class FCMKLDNNHandler const ExecutionContext& ctx, phi::DenseTensor* out) { if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( out->dims(), @@ -393,7 +408,6 @@ class FCMKLDNNKernel : public framework::OpKernel { const auto* bias = ctx.Input("Bias"); auto out = ctx.Output("Out"); - const float scale_in = ctx.Attr("Scale_in"); const auto& scale_weights = ctx.Attr>("Scale_weights"); std::shared_ptr fc_p; @@ -430,7 +444,7 @@ class FCMKLDNNKernel : public framework::OpKernel { std::make_shared(inner_product_cache->dst_mem); if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Input("ResidualData"); out->ShareDataWith(*residual_param); } auto out_ptr = out->mutable_data( @@ -460,8 +474,7 @@ class FCMKLDNNKernel : public framework::OpKernel { dst_memory_p = handler.AcquireCustomDstMemory(ctx, out); if (bias) { - bias_memory_p = - handler.AcquireBiasMemoryWithReorder(bias, scale_in, scale_weights); + bias_memory_p = handler.AcquireBiasMemoryWithReorder(ctx, bias); } fc_p = handler.AcquireForwardPrimitive(); From 2d0fb05963460e5db280d11a0a1a7af00af07b77 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Mon, 21 Nov 2022 19:12:02 +0800 Subject: [PATCH 130/210] Fix Ctx Dev pointer for KUNLUN (#48184) --- paddle/fluid/distributed/collective/ProcessGroupBKCL.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 898166faae187..75953dc0b4289 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -292,14 +292,14 @@ phi::DeviceContext* ProcessGroupBKCL::GetDeviceContext( const 
std::string& key = GetKeyFromPlace(place); if (use_calc_stream) { const auto& iter = place_to_calc_ctx_.find(key); - return *iter->second; + return iter->second; } else { const auto& iter = place_to_comm_ctx_.find(key); PADDLE_ENFORCE_NE(iter, place_to_comm_ctx_.end(), platform::errors::InvalidArgument( "Cannot find device context in process group.")); - return *iter->second; + return iter->second.get(); } } From 809516f61cf7437fe56c0a9ab699e5467c44b9f6 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 21 Nov 2022 19:14:59 +0800 Subject: [PATCH 131/210] fix doc of NPUPlace (#48148) * fix doc of NPUPlace * fix doc of NPUPlace, test=document_fix --- paddle/fluid/pybind/place.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index 72e672792d5e6..b39427f8b3962 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -553,8 +553,11 @@ void BindPlace(pybind11::module &m) { // NOLINT Examples: .. code-block:: python + + # required: npu + import paddle - npu_place = paddle.NPUPlace(0) + place = paddle.NPUPlace(0) )DOC"); g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); From b0eec3171853ed8490eb5fa6a090a124bf92bd28 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 22 Nov 2022 10:39:49 +0800 Subject: [PATCH 132/210] [CodeStyle][py36-][E722] remove import handling for collections.abc in different python versions (#48165) --- python/paddle/distribution/multinomial.py | 5 +---- python/paddle/distribution/normal.py | 5 +---- python/paddle/fluid/backward.py | 5 +---- python/paddle/fluid/dataloader/collate.py | 5 +---- python/paddle/fluid/dataloader/flat.py | 6 ++---- .../fluid/dygraph/dygraph_to_static/origin_info.py | 5 +---- python/paddle/fluid/layers/nn.py | 6 ++---- python/paddle/fluid/layers/rnn.py | 5 +---- python/paddle/fluid/layers/utils.py | 5 +---- python/paddle/fluid/tests/unittests/gradient_checker.py | 5 +---- python/paddle/framework/io.py | 5 +---- python/paddle/nn/layer/rnn.py | 5 +---- python/paddle/vision/transforms/functional_cv2.py | 9 +-------- python/paddle/vision/transforms/functional_pil.py | 9 +-------- python/paddle/vision/transforms/transforms.py | 9 +-------- 15 files changed, 17 insertions(+), 72 deletions(-) diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py index 0408aa9a6065a..5630fd1469189 100644 --- a/python/paddle/distribution/multinomial.py +++ b/python/paddle/distribution/multinomial.py @@ -15,10 +15,7 @@ import paddle from paddle.distribution import categorical, distribution -try: - from collections.abc import Iterable -except: - from collections import Iterable +from collections.abc import Iterable class Multinomial(distribution.Distribution): diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index 061d68ab5afa0..0dd7db2df9e4d 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -26,10 +26,7 @@ tensor, ) -try: - from collections.abc import Iterable -except: - from collections import Iterable +from collections.abc import Iterable class Normal(distribution.Distribution): diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 279ac480453dd..8f8b4bfa73115 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -26,10 +26,7 @@ from .data_feeder import check_type import warnings -try: - from collections.abc import Sequence -except: - from collections import Sequence 
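# Editor's note (illustrative, not part of the patch): once Python 2 support is
# dropped, the try/except fallback above is dead code -- the ABCs have lived in
# collections.abc since Python 3.3, and the old aliases in the collections module
# were removed in Python 3.10 -- so the plain import below is enough. For example:
from collections.abc import Iterable, Sequence
assert isinstance([1, 2, 3], Sequence) and isinstance(range(3), Iterable)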
+from collections.abc import Sequence __all__ = [ 'append_backward', diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index 661a0de13cd51..dd70a3421409d 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -18,10 +18,7 @@ from ..framework import _non_static_mode from .. import core, layers -try: - from collections.abc import Sequence, Mapping -except: - from collections import Sequence, Mapping +from collections.abc import Sequence, Mapping def default_collate_fn(batch): diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/fluid/dataloader/flat.py index 6f3c6edf0b0cc..1e1ed1eebd806 100644 --- a/python/paddle/fluid/dataloader/flat.py +++ b/python/paddle/fluid/dataloader/flat.py @@ -16,10 +16,8 @@ import numbers import numpy as np -try: - from collections.abc import Sequence, Mapping -except: - from collections import Sequence, Mapping +from collections.abc import Sequence, Mapping + FIELD_PREFIX = "_paddle_field_" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index d6ff463a70d7c..7eb9da1206439 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -21,10 +21,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO from paddle.fluid.framework import Program -try: - from collections.abc import Sequence -except: - from collections import Sequence +from collections.abc import Sequence class Location: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b32124e6bc471..076fdf8fdb105 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -59,6 +59,8 @@ ) from paddle.utils import deprecated from paddle import _C_ops, _legacy_C_ops +from collections.abc import Iterable + __all__ = [ 'fc', @@ -6798,10 +6800,6 @@ def lod_append(x, level): x = fluid.layers.data(name='x', shape=[6, 10], lod_level=1) out = fluid.layers.lod_append(x, [1,1,1,1,1,1]) """ - try: - from collections.abc import Iterable - except: - from collections import Iterable if x is None: raise ValueError("Input(x) can't be None.") if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)): diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 60b0eb5da67d0..82da847bbc7b6 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -32,10 +32,7 @@ from ..param_attr import ParamAttr from ..data_feeder import check_variable_and_dtype, check_type, check_dtype -try: - from collections.abc import Sequence -except: - from collections import Sequence +from collections.abc import Sequence __all__ = [ 'RNNCell', diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 66fc253bb097b..7e3e69fda7c07 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -25,10 +25,7 @@ from ..layer_helper import LayerHelper from sys import version_info -try: - from collections.abc import Sequence -except: - from collections import Sequence +from collections.abc import Sequence def convert_to_list(value, n, name, dtype=int): diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 9b08f17dadd7b..c51b02bd43b55 100644 --- 
a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -22,10 +22,7 @@ from paddle.fluid.backward import _append_grad_suffix_, _as_list from paddle.fluid.framework import _test_eager_guard -try: - from collections.abc import Sequence -except: - from collections import Sequence +from collections.abc import Sequence def _product(t): diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 758f734dd9ac7..57e1edfc478c0 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -49,10 +49,7 @@ ) from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -try: - from collections.abc import Iterable -except: - from collections import Iterable +from collections.abc import Iterable __all__ = [] diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index dba69b9848a28..aeac50d0680e1 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -30,10 +30,7 @@ from paddle.static import default_startup_program from paddle.static import program_guard -try: - from collections.abc import Sequence -except: - from collections import Sequence +from collections.abc import Sequence __all__ = [] diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 52609b786fe1f..f22b63d83f871 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -12,22 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import math import numbers -import collections import numpy as np import paddle from paddle.utils import try_import -if sys.version_info < (3, 3): - Sequence = collections.Sequence - Iterable = collections.Iterable -else: - Sequence = collections.abc.Sequence - Iterable = collections.abc.Iterable +from collections.abc import Sequence, Iterable __all__ = [] diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 595d92a84b11b..432cd7da2bd8a 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -12,20 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import numbers -import collections from PIL import Image, ImageOps, ImageEnhance import numpy as np import paddle -if sys.version_info < (3, 3): - Sequence = collections.Sequence - Iterable = collections.Iterable -else: - Sequence = collections.abc.Sequence - Iterable = collections.abc.Iterable +from collections.abc import Sequence, Iterable try: # PIL version >= "9.1.0" diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index f5cbd90ffc7b0..13be056df2daf 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -13,23 +13,16 @@ # limitations under the License. import math -import sys import random import numpy as np import numbers -import collections import traceback import paddle from . 
import functional as F -if sys.version_info < (3, 3): - Sequence = collections.Sequence - Iterable = collections.Iterable -else: - Sequence = collections.abc.Sequence - Iterable = collections.abc.Iterable +from collections.abc import Sequence, Iterable __all__ = [] From 3c0bd3afbb4349424688f3adc62681ab4cca3bd2 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 22 Nov 2022 10:52:17 +0800 Subject: [PATCH 133/210] use full directly if device is CPU and in dygraph, for optimizer (#48189) --- python/paddle/optimizer/optimizer.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 59663bb819088..6a9d504cc29bc 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -730,10 +730,22 @@ def _add_accumulator( ) if device is None: device = self._get_device_for_param(param.name) - with device_guard(device): - self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value)) + + if in_dygraph_mode() and ( + device == 'cpu' or isinstance(device, core.CPUPlace) + ): + _C_ops.full_( + var, + var.shape, + str(float(fill_value)), + var.dtype, + core.CPUPlace(), ) + else: + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value)) + ) if framework._non_static_mode(): if len(self._accumulators_holder) > 0: From 6992170e668228259b0b9c3e789f24cc3c5820ed Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Tue, 22 Nov 2022 11:02:22 +0800 Subject: [PATCH 134/210] fix_var_recursive (#48206) --- .../auto_parallel/operators/common.py | 6 +- .../dist_check_finite_and_unscale.py | 8 +-- .../auto_parallel/operators/dist_default.py | 5 +- .../auto_parallel/operators/dist_eltwise.py | 1 - .../auto_parallel/operators/dist_embedding.py | 14 ++-- .../auto_parallel/operators/dist_matmul.py | 68 +++++++------------ .../auto_parallel/operators/dist_pnorm.py | 8 +-- .../operators/dist_reduce_sum_p.py | 4 +- .../auto_parallel/operators/dist_reshape.py | 24 +++---- .../auto_parallel/operators/dist_softmax.py | 1 - .../auto_parallel/operators/dist_transpose.py | 1 - .../operators/dist_update_loss_scaling.py | 2 +- 12 files changed, 57 insertions(+), 85 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index af9c53a88ea86..72ed66f3e41a0 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -266,13 +266,13 @@ def is_parameter_related(varname, block): varname = varname[: varname.index(".cast_fp")] if ".quantized" in varname: varname = varname[: varname.index(".quantized")] - assert block.has_var(varname) - var = block.var(varname) + assert block._find_var_recursive(varname) + var = block._var_recursive(varname) return var.is_parameter def infer_shape(block, src_var, src_var_dist_attr, op_input_dist_attr): - var_shape = block.var(src_var.name).shape + var_shape = block._var_recursive(src_var.name).shape var_topoloy = src_var_dist_attr.process_mesh.topology var_dims_mapping = src_var_dist_attr.dims_mapping diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py index 4fa689f5fa1ae..c1834bde1136c 100644 --- 
a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -117,7 +117,7 @@ def backward(ctx, *args, **kwargs): if ( rank_id in ctx.get_tensor_dist_attr_for_program( - main_block.var(varname) + main_block._var_recursive(varname) ).process_mesh.processes ): filter_vars.append(varname) @@ -132,7 +132,7 @@ def backward(ctx, *args, **kwargs): # sync result group = new_process_group(world_process_group.ranks) - inf_var = main_block.var(kwargs['FoundInfinite'][0]) + inf_var = main_block._var_recursive(kwargs['FoundInfinite'][0]) inf_var_int32 = main_block.create_var( name=inf_var.name + "@cast_int32", shape=inf_var.shape, @@ -179,7 +179,7 @@ def backward(ctx, *args, **kwargs): new_op_dist_attr = OperatorDistributedAttribute() for varname in op.input_arg_names: var_dist_attr = ctx.get_tensor_dist_attr_for_program( - main_block.var(varname) + main_block._var_recursive(varname) ) assert var_dist_attr is not None new_op_dist_attr.set_input_dims_mapping( @@ -187,7 +187,7 @@ def backward(ctx, *args, **kwargs): ) for varname in op.output_arg_names: var_dist_attr = ctx.get_tensor_dist_attr_for_program( - main_block.var(varname) + main_block._var_recursive(varname) ) new_op_dist_attr.set_output_dims_mapping( varname, var_dist_attr.dims_mapping diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index b1c0014045b95..85ffb77d97b52 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -69,7 +69,7 @@ def prim_operator_data_parallel_functor(ctx, src_op): }, ) - grad_var = main_block.var(var_name) + grad_var = main_block._var_recursive(var_name) dims_mapping = ctx.get_tensor_dist_attr_for_program( grad_var ).dims_mapping @@ -140,7 +140,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): res.append(cost_mapping) main_block = backward_op.block - vars = main_block.vars need_gradient_allreduce = False for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): @@ -588,7 +587,7 @@ def backward(ctx, *args, **kwargs): for varname in backward_op.desc.output(output_name): if varname in kwargs["grad_var_to_var"]: fwd_name = kwargs["grad_var_to_var"][varname] - if fwd_name not in main_block.vars: + if not main_block._find_var_recursive(fwd_name): continue if is_parameter_related(fwd_name, main_block): out_grad_names.append(varname) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py index 50d4f138dcaa6..75dcc98faa130 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py @@ -84,7 +84,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): res.append(cost_mapping) main_block = backward_op.block - vars = main_block.vars need_gradient_allreduce = False for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index af3514c85f1a5..683236cadd14f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ 
-370,9 +370,9 @@ def forward(ctx, *args, **kwargs): kwargs['Out'] ) - Ids_var = main_block.var(kwargs['Ids'][0]) + Ids_var = main_block._var_recursive(kwargs['Ids'][0]) Weight_var = main_block._var_recursive(kwargs['W'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) # support lookup_table_v1 if src_op.type == 'lookup_table': @@ -507,7 +507,7 @@ def forward(ctx, *args, **kwargs): allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx for input_varname in c_allreduce_sum_op.desc.input_arg_names(): - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var) assert tensor_dist_attr is not None allreduce_op_dist_attr.set_input_dist_attr( @@ -607,10 +607,10 @@ def backward(ctx, *args, **kwargs): kwargs['W@GRAD'] ) - Ids_var = main_block.var(kwargs['Ids'][0]) - Weight_var = main_block.var(kwargs['W'][0]) - Out_grad = main_block.var(kwargs['Out@GRAD'][0]) - Weight_grad = main_block.var(kwargs['W@GRAD'][0]) + Ids_var = main_block._var_recursive(kwargs['Ids'][0]) + Weight_var = main_block._var_recursive(kwargs['W'][0]) + Out_grad = main_block._var_recursive(kwargs['Out@GRAD'][0]) + Weight_grad = main_block._var_recursive(kwargs['W@GRAD'][0]) embedding_row_dim_mapping = dist_attr.get_input_dims_mapping( Weight_var.name diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index ace72f0a2162a..fa6557f497bb2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -316,10 +316,10 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): kwargs['Y@GRAD'] ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) Y_var = main_block._var_recursive(kwargs['Y'][0]) - Out_grad = main_block.var(kwargs['Out@GRAD'][0]) - Y_grad = main_block.var(kwargs['Y@GRAD'][0]) + Out_grad = main_block._var_recursive(kwargs['Out@GRAD'][0]) + Y_grad = main_block._var_recursive(kwargs['Y@GRAD'][0]) assert not is_parameter_related( X_var.name, main_block @@ -433,7 +433,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): has_x_grad = len(kwargs['X@GRAD']) > 0 if has_x_grad: assert len(kwargs['X@GRAD']) == 1 - X_grad = main_block.var(kwargs['X@GRAD'][0]) + X_grad = main_block._var_recursive(kwargs['X@GRAD'][0]) intermediate_var_0 = main_block.create_var( name=unique_name.generate_with_ignorable_key( ".".join(["c_identity", 'tmp']) @@ -572,7 +572,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -647,7 +646,6 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-1] @@ -762,9 +760,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - Weight_var = main_block.var(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) + Out_var = 
main_block._var_recursive(kwargs['Out'][0]) trans_x = src_op.attr("transpose_X") trans_y = src_op.attr("transpose_Y") @@ -906,7 +904,7 @@ def forward(ctx, *args, **kwargs): input_varname, input_dist_attr ) else: - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program( input_var ) @@ -958,7 +956,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -1023,8 +1020,6 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars - parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-2] @@ -1147,9 +1142,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - Weight_var = main_block.var(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + Weight_var = main_block._var_recursive(kwargs['Y'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) trans_x = src_op.attr('transpose_X') trans_y = src_op.attr('transpose_Y') @@ -1268,7 +1263,7 @@ def forward(ctx, *args, **kwargs): allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx for input_varname in c_allreduce_sum_op.desc.input_arg_names(): - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var) assert tensor_dist_attr is not None allreduce_op_dist_attr.set_input_dist_attr( @@ -1316,7 +1311,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars # calc comp op cost desc_mapping = build_comp_desc_from_dist_op( @@ -1469,7 +1463,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -1549,8 +1542,6 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars - parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-1] @@ -1665,9 +1656,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) Weight_var = main_block._var_recursive(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) trans_x = src_op.attr('trans_x') trans_y = src_op.attr('trans_y') @@ -1808,7 +1799,7 @@ def forward(ctx, *args, **kwargs): input_varname, input_dist_attr ) else: - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program( input_var ) @@ -1858,7 +1849,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars + Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -1924,8 +1915,6 @@ def calc_fwd_cost(self, dist_op, 
ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars - parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-2] @@ -2047,9 +2036,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) Weight_var = main_block._var_recursive(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) trans_x = src_op.attr('trans_x') trans_y = src_op.attr('trans_y') @@ -2167,7 +2156,7 @@ def forward(ctx, *args, **kwargs): allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx for input_varname in c_allreduce_sum_op.desc.input_arg_names(): - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var) assert tensor_dist_attr is not None allreduce_op_dist_attr.set_input_dist_attr( @@ -2215,7 +2204,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars process_mesh = dist_attr.process_mesh # calc comp op cost @@ -2370,7 +2358,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -2445,7 +2432,6 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-1] @@ -2555,9 +2541,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) Weight_var = main_block._var_recursive(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) # TODO infer logic comm presentation matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( @@ -2712,7 +2698,7 @@ def forward(ctx, *args, **kwargs): input_varname, input_dist_attr ) else: - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program( input_var ) @@ -2763,7 +2749,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): dist_attr = dist_op.dist_attr process_mesh = dist_attr.process_mesh main_block = backward_op.block - vars = main_block.vars Y_var_dim_mapping = dist_attr.get_input_dims_mapping( backward_op.input("Y")[0] ) @@ -2827,8 +2812,6 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comm op cost serial_op = dist_op.serial_op - vars = serial_op.block.vars - parallel_axis = dist_op.dist_attr.get_input_dims_mapping( serial_op.input("Y")[0] )[-2] @@ -2947,9 +2930,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) Weight_var = main_block._var_recursive(kwargs['Y'][0]) - Out_var = main_block.var(kwargs['Out'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) # TODO infer logic comm presentation matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( @@ -3082,7 +3065,7 @@ def forward(ctx, *args, **kwargs): allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type 
allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx for input_varname in c_allreduce_sum_op.desc.input_arg_names(): - input_var = main_block.var(input_varname) + input_var = main_block._var_recursive(input_varname) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var) assert tensor_dist_attr is not None allreduce_op_dist_attr.set_input_dist_attr( @@ -3130,7 +3113,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op dist_attr = dist_op.dist_attr main_block = backward_op.block - vars = main_block.vars # calc comp op cost desc_mapping = build_comp_desc_from_dist_op( diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 662b4d666fdc4..99cc63a7b93dd 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -155,7 +155,7 @@ def forward(ctx, *args, **kwargs): ctx, op_dist_attr.process_mesh, rank_id ) - X_var = main_block.var(kwargs['X'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) in_dims_mapping = op_dist_attr.get_input_dims_mapping(X_var.name) for axis in range(len(in_dims_mapping)): if in_dims_mapping[axis] != -1: @@ -260,13 +260,13 @@ def backward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - X_grad_var = main_block.var(kwargs['X@GRAD'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + X_grad_var = main_block._var_recursive(kwargs['X@GRAD'][0]) # 1. copy p_norm_grad op and reset input name and output name new_kwargs = copy.deepcopy(kwargs) new_kwargs['X'] = [".".join(["c_allgather", X_var.name])] - new_X_var = main_block.var(new_kwargs['X'][0]) + new_X_var = main_block._var_recursive(new_kwargs['X'][0]) new_X_grad = main_block.create_var( name=".".join(["c_allgather", X_grad_var.name]), dtype=X_grad_var.dtype, diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py index 01b326d3a562c..75dbb7f9c0dcb 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py @@ -54,7 +54,7 @@ def is_output_compatible(self, dist_op): return False output_name = outputs[0] - output_var = dist_op.serial_op.block.var(output_name) + output_var = dist_op.serial_op.block._var_recursive(output_name) if output_var.shape != (1,): return False @@ -124,7 +124,7 @@ def forward(ctx, *args, **kwargs): ) # dist attr - var = main_block.var(var_name) + var = main_block._var_recursive(var_name) tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(var) op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) new_op_attr = OperatorDistributedAttribute() diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index b305d88d7df1b..7d4aa3f517be8 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -53,7 +53,6 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): def calc_fwd_cost(self, dist_op, ctx, cluster): res = [] op = dist_op.serial_op - vars = op.block.vars dist_attr = dist_op.dist_attr shape_list = op.desc.attr("shape") @@ -103,7 +102,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op main_block = backward_op.block 
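# Editor's note (illustrative, not part of the patch): this patch swaps
# block.var(name)/block.has_var(name) for block._var_recursive(name)/
# block._find_var_recursive(name) so that variables created in an ancestor block
# (for example, when the serial op sits inside a control-flow sub-block) are still
# found. A self-contained toy model of that lookup, with hypothetical names:
class _ToyBlock:
    def __init__(self, parent=None):
        self.vars, self.parent = {}, parent  # hypothetical toy attributes

    def find_var_recursive(self, name):
        block = self
        while block is not None:          # walk up through ancestor blocks
            if name in block.vars:
                return block.vars[name]
            block = block.parent
        return None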
need_gradient_allreduce = False - vars = main_block.vars for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and is_parameter_related( @@ -246,9 +244,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - Out_var = main_block.var(kwargs['Out'][0]) - XShape_var = main_block.var(kwargs['XShape'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) + XShape_var = main_block._var_recursive(kwargs['XShape'][0]) shape_list = src_op.desc.attr("shape") ShapeTensor_var_list = [] for name in kwargs['ShapeTensor']: @@ -303,7 +301,6 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): def calc_fwd_cost(self, dist_op, ctx, cluster): res = [] op = dist_op.serial_op - vars = op.block.vars dist_attr = dist_op.dist_attr shape_list = op.desc.attr("shape") @@ -353,7 +350,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op main_block = backward_op.block need_gradient_allreduce = False - vars = main_block.vars for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and not is_parameter_related( @@ -499,9 +495,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - Out_var = main_block.var(kwargs['Out'][0]) - XShape_var = main_block.var(kwargs['XShape'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) + XShape_var = main_block._var_recursive(kwargs['XShape'][0]) shape_list = src_op.desc.attr("shape") ShapeTensor_var_list = [] for name in kwargs['ShapeTensor']: @@ -556,7 +552,6 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): def calc_fwd_cost(self, dist_op, ctx, cluster): res = [] op = dist_op.serial_op - vars = op.block.vars dist_attr = dist_op.dist_attr shape_list = op.desc.attr("shape") @@ -606,7 +601,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op main_block = backward_op.block need_gradient_allreduce = False - vars = main_block.vars for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and not is_parameter_related( @@ -745,9 +739,9 @@ def forward(ctx, *args, **kwargs): output_name ) - X_var = main_block.var(kwargs['X'][0]) - Out_var = main_block.var(kwargs['Out'][0]) - XShape_var = main_block.var(kwargs['XShape'][0]) + X_var = main_block._var_recursive(kwargs['X'][0]) + Out_var = main_block._var_recursive(kwargs['Out'][0]) + XShape_var = main_block._var_recursive(kwargs['XShape'][0]) shape_list = src_op.desc.attr("shape") ShapeTensor_var_list = [] for name in kwargs['ShapeTensor']: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py index c480a634b097c..0059d0e1bb459 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py @@ -79,7 +79,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op main_block = backward_op.block need_gradient_allreduce = False - vars = main_block.vars for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and is_parameter_related( diff --git 
a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py index 6153b4a7406e3..c5ce7628dc7d4 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py @@ -160,7 +160,6 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op = dist_op.serial_op main_block = backward_op.block need_gradient_allreduce = False - vars = main_block.vars for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and is_parameter_related( diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py index 26c530250e15a..048d06791bbfe 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -151,7 +151,7 @@ def backward(ctx, *args, **kwargs): if ( rank_id in ctx.get_tensor_dist_attr_for_program( - main_block.var(varname) + main_block._var_recursive(varname) ).process_mesh.processes ): filter_vars.append(varname) From d6b94d2699dd969a56352d006746e1dabee2182b Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 22 Nov 2022 11:02:33 +0800 Subject: [PATCH 135/210] [CodeStyle][py2][py311] replace deprecated `inspect.getargspec` with `inspect.getfullargspec` (#48218) * [CodeStyle][py2] use inspect.getfullargspect instead of deprecated inspect.getargspec * refactor to f-string --- .../distributed/auto_parallel/dist_tensor.py | 2 +- .../fluid/dygraph/dygraph_to_static/utils.py | 41 ++----------------- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/hapi/model.py | 5 +-- 4 files changed, 7 insertions(+), 43 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 9a6f9c41154e1..8ba0e8570e1d5 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -306,7 +306,7 @@ def new_local_tensor(self, block=None, rank=None, name=None): def _copy_kwargs(serial_tensor): kwargs = {} no_need_copy_args = ["self", "block", "shape", "name"] - arg_spec = inspect.getargspec(Variable.__init__) + arg_spec = inspect.getfullargspec(Variable.__init__) for key in arg_spec.args: # TODO: Check the copied attribute from serial tensor whether valid diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 9687efb942efa..f5d425ca0ac06 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -96,21 +96,6 @@ def visit(self, node): RE_PYNAME = '[a-zA-Z0-9_]+' RE_PYMODULE = r'[a-zA-Z0-9_]+\.' -# FullArgSpec is valid from Python3. Defined a Namedtuple to -# to make it available in Python2. 
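# Editor's note (illustrative, not part of the patch): inspect.getargspec() was
# deprecated and is removed in Python 3.11, so the Python 2 FullArgSpec shim being
# deleted here is no longer needed; inspect.getfullargspec() already returns a
# FullArgSpec with args/varargs/varkw/defaults/kwonlyargs/kwonlydefaults/annotations.
import inspect

def _demo(x, y=1, *rest, z, **kw):
    return x, y, rest, z, kw

spec = inspect.getfullargspec(_demo)
assert spec.args == ['x', 'y'] and spec.varargs == 'rest' and spec.kwonlyargs == ['z']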
-FullArgSpec = collections.namedtuple( - 'FullArgSpec', - [ - 'args', - 'varargs', - 'varkw', - 'defaults', - 'kwonlyargs', - 'kwonlydefaults', - 'annotations', - ], -) - def data_layer_not_check(name, shape, dtype='float32', lod_level=0): """ @@ -199,27 +184,11 @@ def saw(x): return x -def getfullargspec(target): - if hasattr(inspect, "getfullargspec"): - return inspect.getfullargspec(target) - else: - argspec = inspect.getargspec(target) - return FullArgSpec( - args=argspec.args, - varargs=argspec.varargs, - varkw=argspec.keywords, - defaults=argspec.defaults, - kwonlyargs=[], - kwonlydefaults=None, - annotations={}, - ) - - def parse_arg_and_kwargs(function): """ Returns full argument names as list. e.g ['x', 'y', 'z'] """ - fullargspec = getfullargspec(function) + fullargspec = inspect.getfullargspec(function) arg_names = fullargspec.args if arg_names and 'self' == arg_names[0]: arg_names = fullargspec.args[1:] @@ -239,7 +208,7 @@ def parse_varargs_name(function): """ Returns varargs name string of function. e.g: 'input' from `foo(x, *input)` """ - fullargspec = getfullargspec(function) + fullargspec = inspect.getfullargspec(function) varargs = fullargspec.varargs return varargs @@ -354,7 +323,7 @@ def _delete_keywords_from(node): func_src = astor.to_source(gast.gast_to_ast(node.func)) import paddle.fluid as fluid - full_args = eval("inspect.getargspec({})".format(func_src)) + full_args = eval(f"inspect.getfullargspec({func_src})") full_args_name = full_args[0] node.keywords = [k for k in node.keywords if k.arg in full_args_name] @@ -438,9 +407,7 @@ def update_args_of_func(node, dygraph_node, method_name): if method_name == "__init__" or eval( "issubclass({}, fluid.dygraph.Layer)".format(class_src) ): - full_args = eval( - "inspect.getargspec({}.{})".format(class_src, method_name) - ) + full_args = eval(f"inspect.getfullargspec({class_src}.{method_name})") full_args_name = [ arg_name for arg_name in full_args[0] if arg_name != "self" ] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 076fdf8fdb105..d6b2a4bbbc583 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12700,7 +12700,7 @@ def __init__(self, func): self._func = func # find named args using reflection - args = inspect.getargspec(self._func) + args = inspect.getfullargspec(self._func) if len(args[0]) == 0 and args[1] is None and args[2] is None: # Function with no inputs self._named_args = None diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index e64aa47e2d1f7..931e3c3e398b3 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -94,10 +94,7 @@ def restore_flatten_list(l, splits): def extract_args(func): - if hasattr(inspect, 'getfullargspec'): - return inspect.getfullargspec(func)[0] - else: - return inspect.getargspec(func)[0] + return inspect.getfullargspec(func).args def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): From 2995f742e8ab8f17499a857e13861740ccc815a1 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Tue, 22 Nov 2022 11:08:58 +0800 Subject: [PATCH 136/210] fix error of QuantizationTransformPassV2 when has condition block (#48190) * fix error of QuantizationTransformPassV2 when has condition block * fix error --- .../fluid/contrib/slim/quantization/quantization_pass.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py 
b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index f0caabd6f4ea1..8902b40aa68e5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -2481,11 +2481,6 @@ def __init__( self.create_var_map = {} self.create_op_map = {} - # marked the variable which has been dequantized. - self.dequantized_vars = collections.OrderedDict() - self.persistable_vars = [] - self.processed_vars = [] - def _quant_preprocess(self, op_node): user_skipped = False if isinstance(self._skip_pattern, list): @@ -2627,6 +2622,10 @@ def apply(self, graph): ), 'graph must be the instance of IrGraph.' if self._is_test is None: self._is_test = graph.is_test() + # marked the variable which has been dequantized. + self.dequantized_vars = collections.OrderedDict() + self.persistable_vars = [] + self.processed_vars = [] self.persistable_vars = [ p.name() for p in graph.all_persistable_nodes() From 4da1a0fe66f63d1a059c6471660ae7aef24ca44b Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 22 Nov 2022 11:23:31 +0800 Subject: [PATCH 137/210] [PHI decoupling] remove "gpu_device_function.h" in fluid. (#48117) * move "paddle/phi/backends/gpu/gpu_device_function.h" to phi * update copyright years * rm "fluid/platform/device/gpu/gpu_device_function.h" in phi * rm dependence to "gpu_device_function.h" in fluid * rm gpu_device_function.h etc in fluid * fix rocm-complie bugs * fix cuda_helper_test.cu bugs --- paddle/fluid/operators/activation_op.kps | 2 +- .../elementwise/elementwise_op_function.h | 13 +- .../operators/fused/fused_attention_op.cu | 2 +- ...sed_bias_dropout_residual_layer_norm_op.cu | 2 +- .../operators/fused/fused_dropout_common.h | 2 +- .../fused_fc_elementwise_layernorm_op.cu | 2 +- .../fused/fused_gate_attention_op.cu | 2 +- .../fused/fused_multi_transformer_op.cu.h | 2 +- paddle/fluid/operators/group_norm_op.cu | 2 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 4 +- paddle/fluid/operators/math/beam_search.cu | 2 +- paddle/fluid/operators/row_conv_op.cu | 6 +- paddle/fluid/operators/top_k_function_cuda.h | 17 +- .../device/gpu/cuda/cuda_device_function.h | 193 ------------------ .../platform/device/gpu/cuda_helper_test.cu | 4 +- .../platform/device/gpu/gpu_device_function.h | 24 --- .../device/gpu/rocm/rocm_device_function.h | 168 --------------- 17 files changed, 34 insertions(+), 413 deletions(-) delete mode 100644 paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h delete mode 100644 paddle/fluid/platform/device/gpu/gpu_device_function.h delete mode 100644 paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 0ce55b7cf7331..9a522359628eb 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -13,7 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/activation_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 7bcd336732960..5f2097f333050 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -42,7 +42,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" @@ -982,7 +982,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( #pragma unroll for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { // reduce sum with wrap - val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i); + val += phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, val, i); } size_t idx_j = j + threadIdx.y; @@ -1004,7 +1004,8 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( #pragma unroll for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { // reduce sum with wrap - inter_val += platform::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i); + inter_val += + phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i); } if (threadIdx.x == 0 && (idx_j < w)) d_intermediate[idx_j] = inter_val; } @@ -1160,14 +1161,14 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h; if (BcastY) { if (dy) { - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (threadIdx.x == 0) { dy[j] = val; } } } else { if (dx) { - val = paddle::platform::reduceSum(val, tid, h); + val = phi::backends::gpu::reduceSum(val, tid, h); if (threadIdx.x == 0) { dx[j] = val; } @@ -1175,7 +1176,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( } if (!SameShapeOfIntermediateOutAndOut) { if (d_intermediate) { - inter_val = paddle::platform::reduceSum(inter_val, tid, h); + inter_val = phi::backends::gpu::reduceSum(inter_val, tid, h); if (threadIdx.x == 0) { d_intermediate[j] = inter_val; } diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index a13bfcf12ea8d..ef5087f0534e1 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -22,9 +22,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 6da533aa77f3c..664e20b686d7e 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 1b8dc4bb324ca..0fbc14436e914 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -22,10 +22,10 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 74ba0b54afd45..c6cfc86983511 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 8f13424ce49b5..9cb3f19ab1740 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fused_gate_attention.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 777ee83c38dc6..79fc561698989 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -26,9 +26,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index d0a2935197a8c..6b2ba1670a3b7 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -21,7 +21,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 86d01f6dece4c..3d1bd7490795d 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -25,8 +25,8 @@ namespace cub = hipcub; #include #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" @@ -55,7 +55,7 @@ static __forceinline__ __device__ U WarpReduceSum(U val) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } return val; } diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 696ddb5a059ed..400f10558e155 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 3c4253ef800aa..34595180c9d72 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/row_conv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -242,7 +242,7 @@ __global__ void RowConvGradFilterImproved(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); @@ -307,7 +307,7 @@ __global__ void RowConvGradFilter(const T *in, for (int offset = 16; offset > 0; offset = offset / 2) { // blockDim.x is 32. - val += platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } __syncthreads(); diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index e95bca3c2791e..f210f46ea4376 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -26,9 +26,9 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #define FINAL_MASK 0xffffffff @@ -283,8 +283,10 @@ __forceinline__ __device__ Pair WarpReduce(Pair input, if (largest) { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { - T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset); - int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset); + T tmp_val = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset); + int tmp_id = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset); if (input.v < tmp_val || (input.v == tmp_val && input.id > tmp_id)) { input.v = tmp_val; input.id = tmp_id; @@ -293,8 +295,10 @@ __forceinline__ __device__ Pair WarpReduce(Pair input, } else { #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { - T tmp_val = platform::CudaShuffleDownSync(FINAL_MASK, input.v, offset); - int tmp_id = platform::CudaShuffleDownSync(FINAL_MASK, input.id, offset); + T tmp_val = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset); + int tmp_id = + phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.id, offset); if (input.v > tmp_val || (input.v == tmp_val && input.id > tmp_id)) { input.v = tmp_val; input.id = tmp_id; @@ -357,7 +361,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], unsigned mask = 0u; CREATE_SHFL_MASK(mask, 
true); if (tid_max / 32 == wid) { - if (platform::CudaShuffleSync(mask, *beam, tid_max % 32, 32) == MaxLength) + if (phi::backends::gpu::CudaShuffleSync(mask, *beam, tid_max % 32, 32) == + MaxLength) break; } } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h deleted file mode 100644 index c1db9c6770c24..0000000000000 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. -#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace platform { - -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down_sync(mask, val, static_cast(delta), width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor_sync(mask, val, width); -} - -template <> -__forceinline__ __device__ float16 -CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - return float16(__shfl_down_sync( - mask, val.to_half(), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ bfloat16 -CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { -#if defined(PADDLE_CUDA_BF16) - return bfloat16(__shfl_down_sync(mask, - static_cast(val), - static_cast(delta), - width)); -#else - PADDLE_ENFORCE( - false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = static_cast(__shfl_down_sync( - mask, static_cast(val.real), static_cast(delta), width)); - float imag = static_cast(__shfl_down_sync( - mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, - paddle::platform::complex val, - int delta, - int width) { - double real = - 
static_cast(__shfl_down_sync(mask, - static_cast(val.real), - static_cast(delta), - width)); - double imag = - static_cast(__shfl_down_sync(mask, - static_cast(val.imag), - static_cast(delta), - width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, - int width) { - return float16(__shfl_xor_sync(mask, val.to_half(), width)); -} - -template <> -__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, - bfloat16 val, - int width) { -#if defined(PADDLE_CUDA_BF16) - return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - float imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - double imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex(real, imag); -} - -template -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl_sync(mask, val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. 
- const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index a3fff0dbed8e2..f20c89f97a4f5 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -22,9 +22,9 @@ #include #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" using paddle::platform::float16; @@ -214,7 +214,7 @@ static __forceinline__ __device__ T WarpReduceSum(T val) { unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); for (int offset = warpSize / 2; offset > 0; offset /= 2) { - val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); } return val; } diff --git a/paddle/fluid/platform/device/gpu/gpu_device_function.h b/paddle/fluid/platform/device/gpu/gpu_device_function.h deleted file mode 100644 index a8daa5e87fdc3..0000000000000 --- a/paddle/fluid/platform/device/gpu/gpu_device_function.h +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h" -#else -#include "paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h" -#endif - -#endif diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h deleted file mode 100644 index a8ce5f1a1827b..0000000000000 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// NOTE(): support float16 to half in header file. -#define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace platform { - -#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) \ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down(val, delta, width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor(val, width); -} - -template <> -__forceinline__ __device__ float16 -CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { - return float16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ bfloat16 -CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { - return bfloat16(__shfl_down( - static_cast(val), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( - unsigned mask, paddle::platform::complex val, int delta, int width) { - float real = __shfl_down(val.real, delta, width); - float imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex -CudaShuffleDownSync(unsigned mask, - paddle::platform::complex val, - int delta, - int width) { - double real = __shfl_down(val.real, delta, width); - double imag = __shfl_down(val.imag, delta, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, - float16 val, - int width) { - return float16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, - bfloat16 val, - int width) { - return bfloat16(__shfl_xor(static_cast(val), width)); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - float real = __shfl_xor(val.real, width); - float imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template <> -__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( - unsigned mask, paddle::platform::complex val, int width) { - double real = __shfl_xor(val.real, width); - double imag = __shfl_xor(val.imag, width); - return paddle::platform::complex(real, imag); -} - -template -__forceinline__ 
__device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl(val, src_line, width); -} - -template -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. - // To make the reduceSum more efficiently, - // I use Warp-Level Parallelism and assume the Warp size - // is 32 which may be different for different GPU, - // but most card's warp size is 32. -#ifdef PADDLE_WITH_HIP - const int warpSize = 64; -#else - const int warpSize = 32; -#endif - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += platform::CudaShuffleDownSync(mask, val, offset); - } - return val; -} - -} // namespace platform -} // namespace paddle From e0dd4ee9093cf8d14687690213100cb0786e5188 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 22 Nov 2022 12:25:24 +0800 Subject: [PATCH 138/210] bf16 for interpolate, nhwc for bf16 (#48192) --- paddle/phi/kernels/gpu/interpolate_grad_kernel.cu | 7 ++++--- paddle/phi/kernels/gpu/interpolate_kernel.cu | 5 +++-- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 6 ++++++ paddle/phi/kernels/gpudnn/conv_kernel.cu | 10 +++++++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index 51a5f50560eac..b38cae829680b 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -487,13 +487,13 @@ __global__ void KeBicubicInterpBw(T* in, T in_img_idy = align_corners ? static_cast(ratio_h * out_img_idy) : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); + int input_y = floorf(static_cast(in_img_idy)); using MT = typename phi::dtype::MPTypeTrait::Type; const T y_t = static_cast(static_cast(in_img_idy) - input_y); T in_img_idx = align_corners ? static_cast(ratio_w * out_img_idx) : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); + int input_x = floorf(static_cast(in_img_idx)); const T x_t = static_cast(static_cast(in_img_idx) - input_x); T x_coeffs[4]; @@ -1577,7 +1577,8 @@ PD_REGISTER_KERNEL(nearest_interp_grad, phi::NearestInterpGradKernel, float, double, - phi::dtype::float16) { + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); } diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 8135e73142fec..07e113ef7aa80 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -355,14 +355,14 @@ __global__ void KeBicubicInterpFw(const T* in, T in_img_idy = align_corners ? 
static_cast(ratio_h * out_img_idy) : static_cast(ratio_h * (out_img_idy + 0.5) - 0.5); - int input_y = floorf(in_img_idy); + int input_y = floorf(static_cast(in_img_idy)); using MT = typename phi::dtype::MPTypeTrait::Type; const T y_t = static_cast(static_cast(in_img_idy) - input_y); T in_img_idx = align_corners ? static_cast(ratio_w * out_img_idx) : static_cast(ratio_w * (out_img_idx + 0.5) - 0.5); - int input_x = floorf(in_img_idx); + int input_x = floorf(static_cast(in_img_idx)); const T x_t = static_cast(static_cast(in_img_idx) - input_x); T coefficients[4]; @@ -1468,6 +1468,7 @@ PD_REGISTER_KERNEL(nearest_interp, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 0d5f266d3d172..5d1a92a3119bc 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -454,8 +454,14 @@ void ConvCudnnGradKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = paddle::platform::DataLayout::kNCHW; +#else +#if CUDNN_VERSION_MIN(8, 1, 0) + const bool compute_in_nhwc = + (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) && + IsVoltaOrLater(ctx); #else const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); +#endif auto compute_format = compute_in_nhwc && channel_last ? paddle::platform::DataLayout::kNHWC : paddle::platform::DataLayout::kNCHW; diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 3e3b1fb198da9..4044056653162 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -373,10 +373,18 @@ void ConvCudnnKernel(const Context& ctx, #ifdef PADDLE_WITH_HIP // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = paddle::platform::DataLayout::kNCHW; +#else +#if CUDNN_VERSION_MIN(8, 1, 0) + // Tensor Core introduced from Volta GPUs supports more faster conv op + // with FP16 or BF16 in NHWC data format. + const bool compute_in_nhwc = + (dtype == CUDNN_DATA_HALF || dtype == CUDNN_DATA_BFLOAT16) && + IsVoltaOrLater(ctx); #else // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. + // with FP16 in NHWC data format. (BF16 require cudnn >= 8.1.0) const bool compute_in_nhwc = dtype == CUDNN_DATA_HALF && IsVoltaOrLater(ctx); +#endif // We will only do data format conversion from NHWC to NCHW. // cudnn will convert NCHW to NHWC automatically on Tensor Core. 
auto compute_format = compute_in_nhwc && channel_last From 1e8346fedd12fd8b65c5591ffba230c2d5feee05 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Tue, 22 Nov 2022 13:34:55 +0800 Subject: [PATCH 139/210] delete logical_not api (#48078) --- .../dygraph_to_static/convert_operators.py | 3 +- python/paddle/fluid/layers/control_flow.py | 9 ++--- python/paddle/fluid/layers/nn.py | 36 ------------------- python/paddle/fluid/layers/rnn.py | 16 ++++----- .../tests/book/test_machine_translation.py | 2 +- .../fluid/tests/unittests/dist_transformer.py | 3 +- .../unittests/ipu/test_logical_not_op_ipu.py | 2 +- 7 files changed, 18 insertions(+), 53 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index fc91a3a797424..6de6d79267307 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -37,7 +37,6 @@ cast, control_flow, logical_and, - logical_not, logical_or, nn, ) @@ -318,7 +317,7 @@ def convert_logical_not(x): def _run_paddle_logical_not(x): x = cast_bool_if_necessary(x) - return logical_not(x) + return paddle.logical_not(x) def _run_py_logical_not(x): diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 658941ad4446a..ee53f23684ca9 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -27,7 +27,7 @@ in_dygraph_mode, ) from ..layer_helper import LayerHelper, unique_name -from .nn import logical_and, logical_not, logical_or +from .nn import logical_and, logical_or from .utils import ( assert_same_structure, map_structure, @@ -49,6 +49,7 @@ check_dtype, ) from ..backward import _infer_var_data_type_shape_ +import paddle from paddle import _C_ops, _legacy_C_ops __all__ = [ @@ -2807,7 +2808,7 @@ def false_func(): ) ) false_cond_block = ConditionalBlock( - [logical_not(pred)], is_scalar_condition=True + [paddle.logical_not(pred)], is_scalar_condition=True ) with false_cond_block.block(): origin_false_output = false_fn() @@ -3260,13 +3261,13 @@ def case(self, condition): if len(self.pre_not_conditions) == 0: cond_block = ConditionalBlock([condition], is_scalar_condition=True) - not_cond = logical_not(x=condition) + not_cond = paddle.logical_not(x=condition) self.pre_not_conditions.append(not_cond) else: pre_cond_num = len(self.pre_not_conditions) pre_not_cond = self.pre_not_conditions[pre_cond_num - 1] new_not_cond = logical_and( - x=pre_not_cond, y=logical_not(x=condition) + x=pre_not_cond, y=paddle.logical_not(x=condition) ) self.pre_not_conditions.append(new_not_cond) cond_block = ConditionalBlock( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d6b2a4bbbc583..b699de304eb77 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -151,7 +151,6 @@ 'size', 'logical_and', 'logical_or', - 'logical_not', 'clip', 'clip_by_norm', 'mean', @@ -11549,41 +11548,6 @@ def logical_or(x, y, out=None, name=None): ) -@templatedoc() -def logical_not(x, out=None, name=None): - """ - - ``logical_not`` operator computes element-wise logical NOT on ``x``, and returns ``out``. ``out`` is N-dim boolean ``Variable``. - Each element of ``out`` is calculated by - - .. math:: - - out = !x - - Args: - x(Tensor): Operand of logical_not operator. 
Must be a Tensor of type bool, int8, int16, in32, in64, float32, or float64. - out(Tensor): The ``Tensor`` that specifies the output of the operator, which can be any ``Tensor`` that has been created in the program. The default value is None, and a new ``Tensor` will be created to save the output. - name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: ${out_comment} - - Examples: - .. code-block:: python - - import paddle - - x = paddle.to_tensor([True, False, True, False]) - res = paddle.logical_not(x) - print(res) # [False True False True] - """ - if in_dygraph_mode(): - return _C_ops.logical_not(x) - return _logical_op( - op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False - ) - - @templatedoc() def clip(x, min, max, name=None): """ diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 82da847bbc7b6..3401fe468748e 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1332,7 +1332,7 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state): beam_state.lengths, beam_indices, self.batch_size ) next_lengths = next_lengths + tensor.cast( - nn.logical_not(next_finished), beam_state.lengths.dtype + paddle.logical_not(next_finished), beam_state.lengths.dtype ) next_finished = control_flow.logical_or( next_finished, @@ -1481,7 +1481,7 @@ def _maybe_copy(state, new_state, step_mask): initial_states, initial_finished, ) - cond = control_flow.logical_not((nn.reduce_all(initial_finished))) + cond = paddle.logical_not((nn.reduce_all(initial_finished))) sequence_lengths = tensor.cast(tensor.zeros_like(initial_finished), "int64") outputs = None @@ -1505,7 +1505,7 @@ def _maybe_copy(state, new_state, step_mask): next_sequence_lengths = nn.elementwise_add( sequence_lengths, tensor.cast( - control_flow.logical_not(finished), sequence_lengths.dtype + paddle.logical_not(finished), sequence_lengths.dtype ), ) if impute_finished: # rectify the states for the finished. 
@@ -1539,7 +1539,7 @@ def _maybe_copy(state, new_state, step_mask): control_flow.increment(x=step_idx_tensor, value=1.0, in_place=True) step_idx += 1 - cond = control_flow.logical_not(nn.reduce_all(finished)) + cond = paddle.logical_not(nn.reduce_all(finished)) if max_step_num is not None and step_idx > max_step_num: break @@ -1587,7 +1587,7 @@ def _dynamic_decode_declarative( global_finished.stop_gradient = True step_idx = tensor.fill_constant(shape=[1], dtype="int64", value=0) - cond = control_flow.logical_not((nn.reduce_all(initial_finished))) + cond = paddle.logical_not((nn.reduce_all(initial_finished))) if max_step_num is not None: max_step_num = tensor.fill_constant( shape=[1], dtype="int64", value=max_step_num @@ -1665,7 +1665,7 @@ def _create_array_out_of_while(dtype): next_sequence_lengths = nn.elementwise_add( sequence_lengths, tensor.cast( - control_flow.logical_not(global_finished), + paddle.logical_not(global_finished), sequence_lengths.dtype, ), ) @@ -1720,12 +1720,12 @@ def _create_array_out_of_while(dtype): ) if max_step_num is not None: control_flow.logical_and( - control_flow.logical_not(nn.reduce_all(global_finished)), + paddle.logical_not(nn.reduce_all(global_finished)), control_flow.less_equal(step_idx, max_step_num), cond, ) else: - control_flow.logical_not(nn.reduce_all(global_finished), cond) + paddle.logical_not(nn.reduce_all(global_finished), cond) final_outputs = map_structure( lambda array: tensor.tensor_array_to_tensor( diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 4ae6462f02f10..27da08ea00814 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -162,7 +162,7 @@ def decoder_decode(context, is_sparse): # update the break condition: up to the max length or all candidates of # source sentences have ended. 
length_cond = pd.less_than(x=counter, y=array_len) - finish_cond = pd.logical_not(pd.is_empty(x=selected_ids)) + finish_cond = paddle.logical_not(pd.is_empty(x=selected_ids)) pd.logical_and(x=length_cond, y=finish_cond, out=cond) translation_ids, translation_scores = pd.beam_search_decode( diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 514fcf4b869be..88ec3188c9e89 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -26,6 +26,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP +import paddle const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) const_bias_attr = const_para_attr @@ -1860,7 +1861,7 @@ def beam_search(): layers.assign(pre_caches[i]["k"], caches[i]["k"]) layers.assign(pre_caches[i]["v"], caches[i]["v"]) length_cond = layers.less_than(x=step_idx, y=max_len) - finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) + finish_cond = paddle.logical_not(layers.is_empty(x=selected_ids)) layers.logical_and(x=length_cond, y=finish_cond, out=cond) finished_ids, finished_scores = layers.beam_search_decode( diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py index c75f6faa65b65..26c63cc58392f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -42,7 +42,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype="bool" ) - out = paddle.fluid.layers.logical_not(x) + out = paddle.logical_not(x) self.fetch_list = [out.name] def run_model(self, exec_mode): From 4244fa6ee72d5de05788dac89aa26d7813357e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Tue, 22 Nov 2022 13:39:06 +0800 Subject: [PATCH 140/210] =?UTF-8?q?(fluid=E6=B8=85=E7=90=86)remove=20resha?= =?UTF-8?q?pe=20in=20nn.py=20under=20fluid=20(#47967)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove reshape in nn.py under fluid * remove reshape reference * fix test case * fix test case in distribution.uniform * remove fluid reshape reference --- python/paddle/distribution/normal.py | 4 +- python/paddle/distribution/uniform.py | 7 +- python/paddle/fluid/contrib/layers/nn.py | 9 +- .../paddle/fluid/contrib/layers/rnn_impl.py | 19 +- .../contrib/tests/test_model_cast_to_bf16.py | 9 +- python/paddle/fluid/dygraph/parallel.py | 2 +- python/paddle/fluid/layer_helper_base.py | 5 +- python/paddle/fluid/layers/detection.py | 20 +- python/paddle/fluid/layers/distributions.py | 8 +- python/paddle/fluid/layers/nn.py | 235 ------------------ python/paddle/fluid/layers/rnn.py | 10 +- python/paddle/fluid/layers/tensor.py | 3 +- python/paddle/fluid/nets.py | 15 +- .../tests/book/test_machine_translation.py | 4 +- .../fleet/hybrid_parallel_pp_embedding.py | 4 +- .../fleet/hybrid_parallel_shared_weight.py | 4 +- .../fleet/parallel_dygraph_se_resnext.py | 4 +- .../fleet/parallel_dygraph_transformer.py | 19 +- .../tests/unittests/dist_fleet_simnet_bow.py | 6 +- .../fluid/tests/unittests/dist_transformer.py | 14 +- .../dygraph_to_static/bert_dygraph_model.py | 4 +- .../dygraph_to_static/ifelse_simple_func.py | 4 +- .../seq2seq_dygraph_model.py | 
12 +- .../dygraph_to_static/simnet_dygraph_model.py | 4 +- .../unittests/dygraph_to_static/test_bmn.py | 12 +- .../unittests/dygraph_to_static/test_error.py | 12 +- .../unittests/dygraph_to_static/test_lac.py | 8 +- .../unittests/dygraph_to_static/test_mnist.py | 2 +- .../dygraph_to_static/test_mobile_net.py | 4 +- .../dygraph_to_static/test_partial_program.py | 2 +- .../dygraph_to_static/test_ptb_lm.py | 14 +- .../test_reinforcement_learning.py | 2 +- .../dygraph_to_static/test_resnet.py | 2 +- .../dygraph_to_static/test_se_resnet.py | 4 +- .../dygraph_to_static/test_sentiment.py | 38 ++- .../dygraph_to_static/test_simnet.py | 6 +- .../dygraph_to_static/test_tensor_shape.py | 12 +- .../unittests/dygraph_to_static/test_tsm.py | 6 +- .../transformer_dygraph_model.py | 20 +- .../ipu/test_reshape_inplace_op_ipu.py | 2 +- .../unittests/ipu/test_reshape_op_ipu.py | 2 +- .../unittests/ipu/test_varname_inplace_ipu.py | 2 +- .../test_mkldnn_cpu_bfloat16_pass.py | 3 +- .../test_mkldnn_matmul_op_output_fuse_pass.py | 12 +- ...n_reshape_transpose_matmul_v2_fuse_pass.py | 2 +- .../test_trt_conv_quant_dequant_pass.py | 30 +-- .../test_trt_fc_fuse_quant_dequant_pass.py | 8 +- .../test_trt_matmul_quant_dequant.py | 2 +- .../inference/test_trt_multiclass_nms3_op.py | 3 +- .../inference/test_trt_multiclass_nms_op.py | 3 +- .../ir/inference/test_trt_reshape_op.py | 5 +- .../test_trt_shuffle_channel_detect_pass.py | 5 +- .../ir/inference/test_trt_subgraph_pass.py | 3 +- ..._trt_transpose_flatten_concat_fuse_pass.py | 3 +- .../tests/unittests/parallel_dygraph_mnist.py | 2 +- .../parallel_dygraph_sparse_embedding.py | 4 +- .../fluid/tests/unittests/seresnext_net.py | 8 +- .../tests/unittests/test_beam_search_op.py | 3 +- .../tests/unittests/test_dist_fleet_ps.py | 6 +- .../tests/unittests/test_dist_fleet_ps11.py | 6 +- .../tests/unittests/test_dist_fleet_ps12.py | 6 +- .../tests/unittests/test_dist_fleet_ps13.py | 6 +- .../tests/unittests/test_dist_fleet_ps2.py | 6 +- .../tests/unittests/test_dist_fleet_ps3.py | 6 +- .../tests/unittests/test_dist_fleet_ps4.py | 6 +- .../tests/unittests/test_dist_fleet_ps5.py | 6 +- .../tests/unittests/test_dist_fleet_ps6.py | 6 +- .../tests/unittests/test_dist_transpiler.py | 12 +- .../unittests/test_dygraph_mnist_fp16.py | 2 +- .../unittests/test_dygraph_multi_forward.py | 2 +- .../test_eager_deletion_padding_rnn.py | 42 ++-- .../unittests/test_eager_deletion_while_op.py | 8 +- .../test_embedding_id_stop_gradient.py | 2 +- .../test_fuse_relu_depthwise_conv_pass.py | 2 +- .../tests/unittests/test_imperative_basic.py | 14 +- .../tests/unittests/test_imperative_gnn.py | 6 +- ..._imperative_lod_tensor_to_selected_rows.py | 6 +- .../tests/unittests/test_imperative_mnist.py | 2 +- .../test_imperative_ocr_attention_model.py | 41 ++- .../unittests/test_imperative_optimizer.py | 4 +- .../unittests/test_imperative_optimizer_v2.py | 4 +- .../unittests/test_imperative_ptb_rnn.py | 30 +-- .../test_imperative_reinforcement.py | 2 +- .../tests/unittests/test_imperative_resnet.py | 2 +- .../unittests/test_imperative_save_load.py | 30 +-- .../unittests/test_imperative_save_load_v2.py | 30 +-- .../unittests/test_imperative_se_resnext.py | 4 +- ..._imperative_selected_rows_to_lod_tensor.py | 6 +- ...perative_star_gan_with_gradient_penalty.py | 10 +- ..._imperative_transformer_sorted_gradient.py | 19 +- .../unittests/test_ir_memory_optimize_pass.py | 4 +- .../fluid/tests/unittests/test_nn_grad.py | 2 +- .../fluid/tests/unittests/test_reshape_op.py | 35 +-- 
.../tests/unittests/test_static_save_load.py | 30 +-- .../fluid/tests/unittests/test_var_base.py | 4 +- .../fluid/tests/unittests/test_variable.py | 2 +- .../tests/unittests/test_while_loop_op.py | 2 +- .../tests/unittests/transformer_model.py | 10 +- 98 files changed, 382 insertions(+), 712 deletions(-) diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index 0dd7db2df9e4d..c2b20297d5e2a 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -184,7 +184,7 @@ def sample(self, shape=(), seed=0): zero_tmp = tensor.fill_constant_batch_size_like( self.loc + self.scale, batch_shape + shape, self.dtype, 0.0 ) - zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) + zero_tmp_reshape = paddle.reshape(zero_tmp, output_shape) zero_tmp_shape = nn.shape(zero_tmp_reshape) normal_random_tmp = nn.gaussian_random( zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype @@ -199,7 +199,7 @@ def sample(self, shape=(), seed=0): ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = elementwise_add(output, self.loc, name=name) if self.all_arg_is_float: - return nn.reshape(output, shape, name=name) + return paddle.reshape(output, shape, name=name) else: return output diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index 0b4ec6211d4ae..c8f8c40a758a4 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -28,6 +28,7 @@ nn, tensor, ) +import paddle class Uniform(distribution.Distribution): @@ -174,8 +175,8 @@ def sample(self, shape, seed=0): max=1.0, seed=seed, ) - zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) - uniform_random_tmp_reshape = nn.reshape( + zero_tmp_reshape = paddle.reshape(zero_tmp, output_shape) + uniform_random_tmp_reshape = paddle.reshape( uniform_random_tmp, output_shape ) output = uniform_random_tmp_reshape * ( @@ -193,7 +194,7 @@ def sample(self, shape, seed=0): ) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: - return nn.reshape(output, shape, name=name) + return paddle.reshape(output, shape, name=name) else: return output diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 3695f8cad20d7..4bfbe7538617c 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -36,7 +36,8 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ -from paddle.fluid.layers import slice, reshape +from paddle.fluid.layers import slice +import paddle import warnings from paddle import _C_ops, _legacy_C_ops @@ -1549,17 +1550,17 @@ def tdm_sampler( mask, axes=[1], starts=[start_offset], ends=[end_offset] ) - layer_samples = reshape( + layer_samples = paddle.reshape( layer_samples, [-1, layer_sample_num + positive_flag, 1] ) layer_samples.stop_gradient = True - layer_labels = reshape( + layer_labels = paddle.reshape( layer_labels, [-1, layer_sample_num + positive_flag, 1] ) layer_labels.stop_gradient = True - layer_mask = reshape( + layer_mask = paddle.reshape( layer_mask, [-1, layer_sample_num + positive_flag, 1] ) layer_mask.stop_gradient = True diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 69b48fbe23430..78813126e26ac 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -19,6 +19,7 @@ from 
paddle.fluid.dygraph import Layer from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper from paddle.fluid.layers.control_flow import StaticRNN +import paddle __all__ = ['BasicGRUUnit', 'basic_gru', 'BasicLSTMUnit', 'basic_lstm'] @@ -339,7 +340,7 @@ def basic_gru( if bidirectional: direc_num = 2 if init_hidden: - init_hidden = layers.reshape( + init_hidden = paddle.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size] ) @@ -394,7 +395,7 @@ def get_single_direction_output( last_hidden_array.append(last_hidden) last_hidden_output = layers.concat(last_hidden_array, axis=0) - last_hidden_output = layers.reshape( + last_hidden_output = paddle.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size] ) @@ -419,7 +420,7 @@ def get_single_direction_output( rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) - last_hidden = layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size] ) @@ -625,10 +626,10 @@ def basic_lstm( direc_num = 2 # convert to [num_layers, 2, batch_size, hidden_size] if init_hidden: - init_hidden = layers.reshape( + init_hidden = paddle.reshape( init_hidden, shape=[num_layers, direc_num, -1, hidden_size] ) - init_cell = layers.reshape( + init_cell = paddle.reshape( init_cell, shape=[num_layers, direc_num, -1, hidden_size] ) @@ -701,11 +702,11 @@ def get_single_direction_output( last_cell_array.append(last_cell) last_hidden_output = layers.concat(last_hidden_array, axis=0) - last_hidden_output = layers.reshape( + last_hidden_output = paddle.reshape( last_hidden_output, shape=[num_layers, -1, hidden_size] ) last_cell_output = layers.concat(last_cell_array, axis=0) - last_cell_output = layers.reshape( + last_cell_output = paddle.reshape( last_cell_output, shape=[num_layers, -1, hidden_size] ) @@ -729,12 +730,12 @@ def get_single_direction_output( rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2) last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1) - last_hidden = layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[num_layers * direc_num, -1, hidden_size] ) last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1) - last_cell = layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[num_layers * direc_num, -1, hidden_size] ) diff --git a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py index c000d55fccdfa..e04bf0b2b7058 100644 --- a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py +++ b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py @@ -107,17 +107,17 @@ def _graph_common(self, _amp_fun, startup_prog=None): ret = layers.elementwise_add(t, tt) ret = layers.elementwise_mul(ret, t) - ret = layers.reshape(ret, [0, 0]) + ret = paddle.reshape(ret, [0, 0]) with amp.bf16.bf16_guard(): ret_bf16 = layers.elementwise_add(t_bf16, tt_bf16) ret_bf16 = layers.elementwise_mul(ret_bf16, t_bf16) - ret_bf16 = layers.reshape(ret_bf16, [0, 0]) + ret_bf16 = paddle.reshape(ret_bf16, [0, 0]) with amp.bf16.bf16_guard(): ret_fp32bf16 = layers.elementwise_add(t, tt) ret_fp32bf16 = layers.elementwise_mul(ret_fp32bf16, t) - ret_fp32bf16 = layers.reshape(ret_fp32bf16, [0, 0]) + ret_fp32bf16 = paddle.reshape(ret_fp32bf16, [0, 0]) ( static_ret_bf16, @@ -148,7 +148,8 @@ def _graph_common(self, _amp_fun, startup_prog=None): with amp.bf16.bf16_guard(): ret = layers.elementwise_add(t, tt) - ret = 
layers.reshape(ret, [0, 0], act='elu') + ret = paddle.reshape(ret, [0, 0]) + ret = paddle.nn.functional.elu(ret) ret = layers.elementwise_mul(ret, t) ret = layers.elementwise_add(ret, tt) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 85c95c6b2b3e6..cb030f71a45bc 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -320,7 +320,7 @@ def _coalesce_tensors(var_groups): for g_var in grad_vars: g_var_shapes.append(g_var.shape) flattened_vars.append( - nn.reshape(x=g_var, shape=[np.prod(g_var.shape)]) + paddle.reshape(x=g_var, shape=[np.prod(g_var.shape)]) ) coalesced_grad = nn.concat(flattened_vars) coalesced_grads_and_grad_vars.append( diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 91ec751cc282c..39eb4a0947400 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -14,6 +14,7 @@ import copy import numpy as np +import paddle from .framework import ( Variable, @@ -114,7 +115,7 @@ def to_variable(self, value, name=None): ) def _create_weight_normalize(self, attr, shape, dtype): - from .layers import elementwise_mul, elementwise_div, reshape + from .layers import elementwise_mul, elementwise_div # Remove these ops when LayerHelper and layers support indicating # program and block. @@ -275,7 +276,7 @@ def __weight_normalize(g, v, dim): x=v, y=scale if dim is None - else reshape(x=scale, shape=[v.shape[dim]]), + else paddle.reshape(x=scale, shape=[v.shape[dim]]), axis=-1 if dim is None else dim, ) # To serialize the original parameter for inference, maybe a diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index b7a3b2aba9c88..bfa063c105270 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -328,8 +328,8 @@ def retinanet_target_assign( bbox_inside_weight.stop_gradient = True fg_num.stop_gradient = True - cls_logits = nn.reshape(x=cls_logits, shape=(-1, num_classes)) - bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + cls_logits = paddle.reshape(x=cls_logits, shape=(-1, num_classes)) + bbox_pred = paddle.reshape(x=bbox_pred, shape=(-1, 4)) predicted_cls_logits = paddle.gather(cls_logits, score_index) predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) @@ -510,8 +510,8 @@ def rpn_target_assign( target_bbox.stop_gradient = True bbox_inside_weight.stop_gradient = True - cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1)) - bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) + cls_logits = paddle.reshape(x=cls_logits, shape=(-1, 1)) + bbox_pred = paddle.reshape(x=bbox_pred, shape=(-1, 4)) predicted_cls_logits = paddle.gather(cls_logits, score_index) predicted_bbox_pred = paddle.gather(bbox_pred, loc_index) @@ -1750,7 +1750,7 @@ def __reshape_to_2d(var): # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = nn.reshape( + gt_label = paddle.reshape( x=gt_label, shape=(len(gt_label.shape) - 1) * (0,) + (-1, 1) ) gt_label.stop_gradient = True @@ -1769,9 +1769,7 @@ def __reshape_to_2d(var): actual_shape.stop_gradient = True # shape=(-1, 0) is set for compile-time, the correct shape is set by # actual_shape in runtime. 
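The change just below drops the deprecated actual_shape argument: paddle.reshape takes the runtime shape directly, either as a list/tuple or as a 1-D integer Tensor, so a separate compile-time placeholder such as shape=(-1, 0) is no longer needed. A minimal sketch of the same call in isolation, assuming eager mode and a hypothetical flattened loss tensor:

    import paddle

    conf_loss = paddle.rand([6, 1])                          # hypothetical per-prior loss, flattened
    actual_shape = paddle.to_tensor([2, 3], dtype='int32')   # runtime [N, Np] shape: batch 2, 3 priors
    conf_loss = paddle.reshape(conf_loss, shape=actual_shape)
    print(conf_loss.shape)                                   # [2, 3]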
- conf_loss = nn.reshape( - x=conf_loss, shape=(-1, 0), actual_shape=actual_shape - ) + conf_loss = paddle.reshape(x=conf_loss, shape=actual_shape) conf_loss.stop_gradient = True neg_indices = helper.create_variable_for_type_inference(dtype='int32') dtype = matched_indices.dtype @@ -1848,7 +1846,7 @@ def __reshape_to_2d(var): # reshape to [N, Np], N is the batch size and Np is the prior box number. # shape=(-1, 0) is set for compile-time, the correct shape is set by # actual_shape in runtime. - loss = nn.reshape(x=loss, shape=(-1, 0), actual_shape=actual_shape) + loss = paddle.reshape(x=loss, shape=actual_shape) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -2477,9 +2475,9 @@ def _is_list_or_tuple_and_equal(data, length, err_info): box = tensor.concat(reshaped_boxes) var = tensor.concat(reshaped_vars) mbox_locs_concat = tensor.concat(mbox_locs, axis=1) - mbox_locs_concat = nn.reshape(mbox_locs_concat, shape=[0, -1, 4]) + mbox_locs_concat = paddle.reshape(mbox_locs_concat, shape=[0, -1, 4]) mbox_confs_concat = tensor.concat(mbox_confs, axis=1) - mbox_confs_concat = nn.reshape( + mbox_confs_concat = paddle.reshape( mbox_confs_concat, shape=[0, -1, num_classes] ) diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index e7c846c1fe08a..196d89db33e5f 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -228,7 +228,7 @@ def sample(self, shape, seed=0): uniform_random_tmp * (zero_tmp + self.high - self.low) + self.low ) - return nn.reshape(output, output_shape) + return paddle.reshape(output, output_shape) else: output_shape = shape + batch_shape output = ( @@ -240,7 +240,7 @@ def sample(self, shape, seed=0): + self.low ) if self.all_arg_is_float: - return nn.reshape(output, shape) + return paddle.reshape(output, shape) else: return output @@ -382,7 +382,7 @@ def sample(self, shape, seed=0): zero_tmp_shape, mean=0.0, std=1.0, seed=seed ) output = normal_random_tmp * (zero_tmp + self.scale) + self.loc - return nn.reshape(output, output_shape) + return paddle.reshape(output, output_shape) else: output_shape = shape + batch_shape output = ( @@ -394,7 +394,7 @@ def sample(self, shape, seed=0): + self.loc ) if self.all_arg_is_float: - return nn.reshape(output, shape) + return paddle.reshape(output, shape) else: return output diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b699de304eb77..45dac4372a23d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -101,7 +101,6 @@ 'smooth_l1', 'one_hot', 'autoincreased_step_counter', - 'reshape', 'squeeze', 'unsqueeze', 'lod_reset', @@ -6234,240 +6233,6 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter -def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): - r""" - :alias_main: paddle.reshape - :alias: paddle.reshape,paddle.tensor.reshape,paddle.tensor.manipulation.reshape - - This operator changes the shape of ``x`` without changing its data. - - The target shape can be given by ``shape`` or ``actual_shape``. - When ``shape`` and ``actual_shape`` are set at the same time, - ``actual_shape`` has a higher priority than ``shape`` - but at this time ``shape`` can only be an integer list or tuple, and ``shape`` still should be set correctly to - guarantee shape inference in compile-time. - - Some tricks exist when specifying the target shape. - - 1. 
-1 means the value of this dimension is inferred from the total element - number of x and remaining dimensions. Thus one and only one dimension can - be set -1. - - 2. 0 means the actual dimension value is going to be copied from the - corresponding dimension of x. The index of 0s in shape can not exceed - the dimension of x. - - Here are some examples to explain it. - - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [6, 8], the reshape operator will transform x into a 2-D tensor with - shape [6, 8] and leaving x's data unchanged. - - 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - specified is [2, 3, -1, 2], the reshape operator will transform x into a - 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this - case, one dimension of the target shape is set to -1, the value of this - dimension is inferred from the total element number of x and remaining - dimensions. - - 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor - with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, - besides -1, 0 means the actual dimension value is going to be copied from - the corresponding dimension of x. - - **Note**: - The parameter ``actual_shape`` will be deprecated in the future and only use ``shape`` instead to represent the target shape. - - Args: - x(Tensor): An N-D Tensor. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. - shape(list|tuple|Tensor): Define the target shape. At most one dimension of the target shape can be -1. - The data type is ``int32`` . If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. - If ``shape`` is an Tensor, it should be an 1-D Tensor . - actual_shape(variable, optional): An 1-D ``Tensor`` or ``LoDTensor`` . The data type is ``int32`` . If provided, reshape - according to this given shape rather than ``shape`` specifying shape. - That is to say ``actual_shape`` has a higher priority - than ``shape(list|tuple)`` but not ``shape(Tensor)``. \ - This argument ``actual_shape`` will be removed in a future version. \ - Instructions for updating: ``actual_shape`` will be removed in future versions and replaced by ``shape``. - act (str, optional): The non-linear activation to be applied to the reshaped input. Default None. - inplace(bool, optional): If ``inplace`` is True, the input and output of ``layers.reshape`` - are the same variable. Otherwise, the input and output of - ``layers.reshape`` are different variable. Default False. Note that if ``x`` - is more than one OPs' input, ``inplace`` must be False. - name(str, optional): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. - - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - # example 1: - # attr shape is a list which doesn't contain Tensors. - data_1 = fluid.data( - name='data_1', shape=[2, 4, 6], dtype='float32') - reshaped_1 = fluid.layers.reshape( - x=data_1, shape=[-1, 0, 3, 2]) - # the shape of reshaped_1 is [2,4,3,2]. 
- - # example 2: - # attr shape is a list which contains Tensors. - data_2 = fluid.layers.fill_constant([2,25], "int32", 3) - dim = fluid.layers.fill_constant([1], "int32", 5) - reshaped_2 = fluid.layers.reshape(data_2, shape=[dim, 10]) - # the shape of reshaped_2 is [5,10]. - - # example 3: - data_3 = fluid.data( - name="data_3", shape=[2,4,6], dtype='float32') - reshaped_3 = fluid.layers.reshape(x=data_3, shape=[6,8]) - # the shape of reshaped_3 is [6,8]. - """ - if in_dygraph_mode(): - tmp_tensor_type = core.eager.Tensor - # TODO(zhiqiu): enable inplace in dygraph mode. - if inplace: - warnings.warn( - "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." - ) - if isinstance(shape, (list, tuple)): - shape = [ - item.numpy().item(0) if isinstance(item, Variable) else item - for item in shape - ] - out = _C_ops.reshape(x, shape) - elif isinstance(shape, tmp_tensor_type): - # TODO: Tensor shape in reshape has not been tested - shape.stop_gradient = True - out = _C_ops.reshape(x, shape) - else: - raise ValueError( - "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape)) - ) - - return dygraph_utils._append_activation_in_dygraph(out, act) - else: - if _in_legacy_dygraph(): - tmp_tensor_type = Variable - if inplace: - warnings.warn( - "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." - ) - if isinstance(shape, (list, tuple)): - shape = [ - item.numpy().item(0) if isinstance(item, Variable) else item - for item in shape - ] - out, _ = _legacy_C_ops.reshape2(x, None, 'shape', shape) - elif isinstance(shape, tmp_tensor_type): - shape.stop_gradient = True - out, _ = _legacy_C_ops.reshape2(x, shape) - else: - raise ValueError( - "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape)) - ) - - return dygraph_utils._append_activation_in_dygraph(out, act) - - check_variable_and_dtype( - x, - 'x', - [ - 'float16', - 'float32', - 'float64', - 'int16', - 'int32', - 'int64', - 'bool', - 'uint16', - ], - 'reshape', - ) - check_type(shape, 'shape', (list, tuple, Variable), 'reshape') - check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') - - helper = LayerHelper("reshape2", **locals()) - - def get_attr_shape(list_shape): - unk_dim_idx = -1 - attrs_shape = [] - for dim_idx, dim_size in enumerate(list_shape): - if isinstance(dim_size, Variable): - attrs_shape.append(-1) - else: - attrs_shape.append(dim_size) - if dim_size == -1: - assert unk_dim_idx == -1, ( - "Only one dimension value of 'shape' in reshape can " - "be -1. But received shape[%d] is also -1.\n" - "\n\t# N = x.shape()[2]\t\t# N is an int. " - "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" - "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" - "\t# z.shape is [-1, -1, 4]\n\n" - " If your target shape in Reshape represents dynamic shape, " - "please turn it into a Tensor under @to_static. See above example for details." - % dim_idx - ) - unk_dim_idx = dim_idx - elif dim_size == 0: - assert dim_idx < len(x.shape), ( - "The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. " - "But received shape[%d] = 0, X's dimensions = %d." - % (dim_idx, len(x.shape)) - ) - else: - assert dim_size > 0, ( - "Each dimension value of 'shape' in reshape must not " - "be negative except one unknown dimension. " - "But received shape[%d] = %s." 
- % (dim_idx, str(dim_size)) - ) - return attrs_shape - - inputs = {"X": x} - attrs = {} - if isinstance(shape, Variable): - shape.stop_gradient = True - inputs["Shape"] = shape - elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ( - "The size of 'shape' in reshape can't be zero, " - "but received %s." % len(shape) - ) - attrs["shape"] = get_attr_shape(shape) - if utils._contain_var(shape): - inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) - elif isinstance(actual_shape, Variable): - actual_shape.stop_gradient = True - inputs["Shape"] = actual_shape - - out = ( - x - if inplace - else helper.create_variable_for_type_inference(dtype=x.dtype) - ) - x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="reshape2", - inputs=inputs, - attrs=attrs, - outputs={"Out": out, "XShape": x_shape}, - ) - - return helper.append_activation(out) - - def squeeze(input, axes, name=None): """ This OP will squeeze single-dimensional entries of input tensor's shape. If axes is provided, will diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 3401fe468748e..9b384203fa885 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1036,7 +1036,7 @@ def tile_beam_merge_with_batch(x, beam_size): x, list(range(2, len(x.shape))) + [0, 1] ) # [..., batch_size, beam_size] # use 0 to copy to avoid wrong shape - x = nn.reshape( + x = paddle.reshape( x, shape=[0] * (len(x.shape) - 2) + [-1] ) # [..., batch_size * beam_size] x = nn.transpose( @@ -1059,7 +1059,7 @@ def _split_batch_beams(self, x): """ check_type(x, 'x', (Variable), 'BeamSearchDecoder._split_batch_beams') # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch - return nn.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) + return paddle.reshape(x, shape=[-1, self.beam_size] + list(x.shape[1:])) def _merge_batch_beams(self, x): r""" @@ -1076,7 +1076,7 @@ def _merge_batch_beams(self, x): """ check_type(x, 'x', (Variable), 'BeamSearchDecoder._merge_batch_beams') # TODO: avoid fake shape in compile-time like tile_beam_merge_with_batch - return nn.reshape(x, shape=[-1] + list(x.shape[2:])) + return paddle.reshape(x, shape=[-1] + list(x.shape[2:])) def _expand_to_beam_size(self, x): r""" @@ -1311,13 +1311,13 @@ def _beam_search_step(self, time, logits, next_cell_states, beam_state): ) # TODO: length penalty scores = log_probs - scores = nn.reshape(scores, [-1, self.beam_size * self.vocab_size]) + scores = paddle.reshape(scores, [-1, self.beam_size * self.vocab_size]) # TODO: add grad for topk then this beam search can be used to train topk_scores, topk_indices = paddle.topk(x=scores, k=self.beam_size) beam_indices = paddle.floor_divide(topk_indices, self.vocab_size_tensor) token_indices = paddle.remainder(topk_indices, self.vocab_size_tensor) next_log_probs = self._gather( - nn.reshape(log_probs, [-1, self.beam_size * self.vocab_size]), + paddle.reshape(log_probs, [-1, self.beam_size * self.vocab_size]), topk_indices, self.batch_size, ) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 3982bcac6fa13..d032b8cd20a7c 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -1948,7 +1948,8 @@ def _check_attr(attr, message): if batch_val <= 0: raise TypeError("batch_shape should be a positive int list") - from .nn import reshape, expand + from .nn import expand + from paddle import reshape out = reshape(x=out, shape=re_shape) 
out = expand(x=out, expand_times=expand_times) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index bccb7e039824b..eab247452fc92 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -16,6 +16,7 @@ from . import layers from .data_feeder import check_variable_and_dtype, convert_dtype from ..utils import deprecated +import paddle __all__ = [ "simple_img_conv_pool", @@ -569,7 +570,7 @@ def __split_heads(x, num_heads): # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim] # into a 4-D output: # [batch_size, max_sequence_length, num_heads, hidden_size_per_head]. - reshaped = layers.reshape( + reshaped = paddle.reshape( x=x, shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads], ) @@ -598,7 +599,7 @@ def __combine_heads(x): raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - return layers.reshape( + return paddle.reshape( x=trans_x, shape=list( map( @@ -622,12 +623,10 @@ def __combine_heads(x): scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - weights = layers.reshape( - x=layers.reshape( - x=product, shape=[-1, product.shape[-1]], act="softmax" - ), - shape=product.shape, - ) + x = paddle.reshape(x=product, shape=[-1, product.shape[-1]]) + x = paddle.nn.functional.softmax(x) + weights = paddle.reshape(x=x, shape=product.shape) + if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, is_test=False diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 27da08ea00814..20b36550fa1de 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -140,7 +140,9 @@ def decoder_decode(context, is_sparse): topk_scores, topk_indices = pd.topk(current_score, k=beam_size) # calculate accumulated scores after topk to reduce computation cost accu_scores = pd.elementwise_add( - x=pd.log(topk_scores), y=pd.reshape(pre_score, shape=[-1]), axis=0 + x=pd.log(topk_scores), + y=paddle.reshape(pre_score, shape=[-1]), + axis=0, ) selected_ids, selected_scores = pd.beam_search( pre_ids, diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py index 88b4a66d4fe50..33833454c0152 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py @@ -55,7 +55,7 @@ def forward(self, x1, x2, y1): x_emb = self.word_embeddings(x1) fc = fluid.layers.matmul(x_emb, self.softmax_weight) fc = fluid.layers.elementwise_add(fc, self.softmax_bias) - projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + projection = paddle.reshape(fc, shape=[-1, vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=y1, soft_label=False ) @@ -95,7 +95,7 @@ def __init__(self): def forward(self, args): fc, x2 = args fc = fluid.layers.elementwise_add(fc, self.softmax_bias) - projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + projection = paddle.reshape(fc, shape=[-1, vocab_size]) return projection, x2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py 
b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py index 7f3fff2da92b6..aa1489b48bc51 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_shared_weight.py @@ -59,7 +59,7 @@ def forward(self, x1, x2, y1): x_emb = self.word_embeddings(x1) fc = fluid.layers.matmul(x_emb, self.softmax_weight) fc = fluid.layers.elementwise_add(fc, self.softmax_bias) - projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + projection = paddle.reshape(fc, shape=[-1, vocab_size]) projection = paddle.matmul(projection, self.word_embeddings.weight) @@ -106,7 +106,7 @@ def __init__(self): def forward(self, args): fc, x2 = args fc = fluid.layers.elementwise_add(fc, self.softmax_bias) - projection = fluid.layers.reshape(fc, shape=[-1, vocab_size]) + projection = paddle.reshape(fc, shape=[-1, vocab_size]) return projection, x2 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py index 9883efbb48783..d497c4a369c8f 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py @@ -135,7 +135,7 @@ def __init__(self, num_channels, reduction_ratio): def forward(self, input): y = self._pool(input) - y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = paddle.reshape(y, shape=[-1, self._num_channels]) y = self._squeeze(y) y = self._excitation(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) @@ -326,7 +326,7 @@ def forward(self, inputs): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index f5b5903831b24..21d357fcdd180 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -325,16 +325,16 @@ def forward(self, queries, keys, values, attn_bias): v = self._v_fc(values) # split head - reshaped_q = fluid.layers.reshape( - x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False + reshaped_q = paddle.reshape( + x=q, shape=[0, 0, self._n_head, self._d_key] ) transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) - reshaped_k = fluid.layers.reshape( - x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False + reshaped_k = paddle.reshape( + x=k, shape=[0, 0, self._n_head, self._d_key] ) transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) - reshaped_v = fluid.layers.reshape( - x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False + reshaped_v = paddle.reshape( + x=v, shape=[0, 0, self._n_head, self._d_value] ) transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) @@ -363,10 +363,9 @@ def forward(self, queries, keys, values, attn_bias): if len(out.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) - final_out = fluid.layers.reshape( + final_out = 
paddle.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False, ) # fc to output @@ -839,8 +838,8 @@ def forward(self, dec_inputs=None, enc_output=None): dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias ) - dec_output_reshape = fluid.layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=False + dec_output_reshape = paddle.reshape( + dec_output, shape=[-1, dec_output.shape[-1]] ) if self._weight_sharing: diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index c9a2a16da2db8..5ede941c22dee 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -127,7 +127,7 @@ def train_network( ), is_sparse=is_sparse, ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -154,7 +154,7 @@ def train_network( ), is_sparse=is_sparse, ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -178,7 +178,7 @@ def train_network( ), is_sparse=is_sparse, ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 88ec3188c9e89..fbe292e1f368c 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1142,7 +1142,7 @@ def __split_heads(x, n_head): hidden_size = x.shape[-1] # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. - reshaped = layers.reshape( + reshaped = paddle.reshape( x=x, shape=[0, 0, n_head, hidden_size // n_head] ) @@ -1163,7 +1163,7 @@ def __combine_heads(x): trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. - return layers.reshape( + return paddle.reshape( x=trans_x, shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])), ) @@ -1585,7 +1585,7 @@ def transformer( ) cost = layers.softmax_with_cross_entropy( - logits=layers.reshape(predict, shape=[-1, trg_vocab_size]), + logits=paddle.reshape(predict, shape=[-1, trg_vocab_size]), label=label, soft_label=True if label_smooth_eps else False, ) @@ -1765,7 +1765,7 @@ def beam_search(): while_op = layers.While(cond) # array states will be stored for each step. ids = layers.array_write( - layers.reshape(start_tokens, (-1, 1)), step_idx + paddle.reshape(start_tokens, (-1, 1)), step_idx ) scores = layers.array_write(init_scores, step_idx) # cell states will be overwrited at each step. 
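The transformer hunks above show the other two signature changes behind this migration: paddle.reshape has no inplace flag (the argument is simply dropped) and no act argument, so an activation that used to be fused into the reshape becomes an explicit functional call on the reshaped result. A minimal sketch of both patterns, assuming eager mode and hypothetical attention tensors:

    import paddle
    import paddle.nn.functional as F

    n_head, d_key = 2, 8
    q = paddle.rand([4, 16, n_head * d_key])        # hypothetical [batch, seq, hidden] input

    # was: fluid.layers.reshape(x=q, shape=[0, 0, n_head, d_key], inplace=False)
    reshaped_q = paddle.reshape(q, shape=[0, 0, n_head, d_key])   # 0 copies the matching input dim

    # was: layers.reshape(x=product, shape=[-1, product.shape[-1]], act="softmax")
    product = paddle.rand([4, 16, 16])
    weights = F.softmax(paddle.reshape(product, shape=[-1, product.shape[-1]]))
    print(reshaped_q.shape, weights.shape)          # [4, 16, 2, 8] [64, 16]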
@@ -1790,7 +1790,7 @@ def beam_search(): ] with while_op.block(): pre_ids = layers.array_read(array=ids, i=step_idx) - pre_ids = layers.reshape(pre_ids, (-1, 1, 1)) + pre_ids = paddle.reshape(pre_ids, (-1, 1, 1)) pre_scores = layers.array_read(array=scores, i=step_idx) # sequence_expand can gather sequences according to lod thus can be # used in beam search to sift states corresponding to selected ids. @@ -1830,14 +1830,14 @@ def beam_search(): enc_output=pre_enc_output, caches=pre_caches, ) - logits = layers.reshape(logits, (-1, trg_vocab_size)) + logits = paddle.reshape(logits, (-1, trg_vocab_size)) topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size ) accu_scores = layers.elementwise_add( x=layers.log(topk_scores), - y=layers.reshape(pre_scores, shape=[-1]), + y=paddle.reshape(pre_scores, shape=[-1]), axis=0, ) # beam_search op uses lod to distinguish branches. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 721bc9122165b..0bccfea79d178 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -295,7 +295,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask): input=enc_output, axes=[1], starts=[0], ends=[1] ) next_sent_feat = self.pooled_fc(next_sent_feat) - next_sent_feat = fluid.layers.reshape( + next_sent_feat = paddle.reshape( next_sent_feat, shape=[-1, self._emb_size] ) @@ -391,7 +391,7 @@ def forward( enc_output, next_sent_feat = self.bert_layer( src_ids, position_ids, sentence_ids, input_mask ) - reshaped_emb_out = fluid.layers.reshape( + reshaped_emb_out = paddle.reshape( x=enc_output, shape=[-1, self._emb_size] ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index d7d1fd4cf9622..8459d0d60e7d3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -173,7 +173,7 @@ def nested_if_else(x_v): def nested_if_else_2(x): - y = fluid.layers.reshape(x, [-1, 1]) + y = paddle.reshape(x, [-1, 1]) b = 2 if b < 1: # var `z` is not visible for outer scope @@ -196,7 +196,7 @@ def nested_if_else_2(x): def nested_if_else_3(x): - y = fluid.layers.reshape(x, [-1, 1]) + y = paddle.reshape(x, [-1, 1]) b = 2 # var `z` is visible for func.body if b < 1: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 9f8eba7f59af3..539400ad927ad 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -179,10 +179,10 @@ def _transpose_batch_time(self, x): return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1, x.shape[2])) + return paddle.reshape(x, shape=(-1, x.shape[2])) def _split_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1])) + return paddle.reshape(x, shape=(-1, self.beam_size, x.shape[1])) def _expand_to_beam_size(self, x): x = fluid.layers.unsqueeze(x, [1]) @@ -454,7 +454,7 @@ def beam_search(self, 
inputs): log_probs = fluid.layers.elementwise_add( x=step_log_probs, y=beam_state_log_probs, axis=0 ) - scores = fluid.layers.reshape( + scores = paddle.reshape( log_probs, [-1, self.beam_size * self.tar_vocab_size] ) topk_scores, topk_indices = fluid.layers.topk( @@ -646,7 +646,7 @@ def _transpose_batch_time(self, x): return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1, x.shape[2])) + return paddle.reshape(x, shape=(-1, x.shape[2])) def tile_beam_merge_with_batch(self, x): x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...] @@ -657,7 +657,7 @@ def tile_beam_merge_with_batch(self, x): x, list(range(2, len(x.shape))) + [0, 1] ) # [..., batch_size, beam_size] # use 0 to copy to avoid wrong shape - x = fluid.layers.reshape( + x = paddle.reshape( x, shape=[0] * (len(x.shape) - 2) + [-1] ) # [..., batch_size * beam_size] x = fluid.layers.transpose( @@ -666,7 +666,7 @@ def tile_beam_merge_with_batch(self, x): return x def _split_batch_beams(self, x): - return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1])) + return paddle.reshape(x, shape=(-1, self.beam_size, x.shape[1])) def _expand_to_beam_size(self, x): x = fluid.layers.unsqueeze(x, [1]) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index 2d58a64ca2dc2..326b2aa0220c9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -503,10 +503,10 @@ def forward(self, left, right): # embedding layer left_emb = self.emb_layer(left) right_emb = self.emb_layer(right) - left_emb = fluid.layers.reshape( + left_emb = paddle.reshape( left_emb, shape=[-1, self.seq_len, self.bow_dim] ) - right_emb = fluid.layers.reshape( + right_emb = paddle.reshape( right_emb, shape=[-1, self.seq_len, self.bow_dim] ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index 9ce37b565b906..e4cb3dd381b71 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -284,9 +284,7 @@ def forward(self, x): xp = paddle.nn.functional.relu(self.p_conv1(x)) # BM layer xp = fluid.layers.matmul(xp, self.sample_mask) - xp = fluid.layers.reshape( - xp, shape=[0, 0, -1, self.dscale, self.tscale] - ) + xp = paddle.reshape(xp, shape=[0, 0, -1, self.dscale, self.tscale]) xp = self.p_conv3d1(xp) xp = fluid.layers.squeeze(xp, axes=[2]) @@ -319,12 +317,8 @@ def _get_mask(cfg): def tem_loss_func(pred_start, pred_end, gt_start, gt_end): def bi_loss(pred_score, gt_label): - pred_score = fluid.layers.reshape( - x=pred_score, shape=[-1], inplace=False - ) - gt_label = fluid.layers.reshape( - x=gt_label, shape=[-1], inplace=False - ) + pred_score = paddle.reshape(x=pred_score, shape=[-1]) + gt_label = paddle.reshape(x=gt_label, shape=[-1]) gt_label.stop_gradient = True pmask = fluid.layers.cast(x=(gt_label > 0.5), dtype=DATATYPE) num_entries = fluid.layers.cast( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 76b04b8f1b7e0..d7a21f3be6b08 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -41,7 +41,7 @@ def func_error_in_compile_time(x): @paddle.jit.to_static def func_error_in_compile_time_2(x): x = fluid.dygraph.to_variable(x) - x = fluid.layers.reshape(x, shape=[1, 2]) + x = paddle.reshape(x, shape=[1, 2]) return x @@ -49,7 +49,7 @@ def func_error_in_compile_time_2(x): def func_error_in_runtime(x): x = fluid.dygraph.to_variable(x) two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") - x = fluid.layers.reshape(x, shape=[1, two]) + x = paddle.reshape(x, shape=[1, two]) return x @@ -101,7 +101,7 @@ def func_error_in_runtime_with_empty_line(x): x = fluid.dygraph.to_variable(x) two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") - x = fluid.layers.reshape(x, shape=[1, two]) + x = paddle.reshape(x, shape=[1, two]) return x @@ -290,7 +290,7 @@ def set_message(self): ), 'def func_error_in_compile_time_2(x):', 'x = fluid.dygraph.to_variable(x)', - 'x = fluid.layers.reshape(x, shape=[1, 2])', + 'x = paddle.reshape(x, shape=[1, 2])', '<--- HERE', 'return x', ] @@ -340,7 +340,7 @@ def set_message(self): ), 'x = fluid.dygraph.to_variable(x)', 'two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")', - 'x = fluid.layers.reshape(x, shape=[1, two])', + 'x = paddle.reshape(x, shape=[1, two])', '<--- HERE', 'return x', ] @@ -356,7 +356,7 @@ def set_message(self): self.filepath ), 'two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")', - 'x = fluid.layers.reshape(x, shape=[1, two])', + 'x = paddle.reshape(x, shape=[1, two])', '<--- HERE', 'return x', ] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 9d00db1caa660..c195081f5a0fd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -87,13 +87,9 @@ def forward(self, inputs): input_ = fluid.layers.slice( inputs, axes=[1], starts=[j], ends=[j + 1] ) - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False - ) + input_ = paddle.reshape(input_, [-1, input_.shape[2]]) hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False - ) + hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]]) res.append(hidden_) if self.is_reverse: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index a35aaf57ee6e1..cfe085426932c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -125,7 +125,7 @@ def forward(self, inputs, label=None): def inference(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) - x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = paddle.reshape(x, shape=[-1, self.pool_2_shape]) x = self._fc(x) return x diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index af50300be3a1c..a3b55386b438b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -271,7 +271,7 @@ def forward(self, inputs): for dws in self.dwsl: y = dws(y) y = self.pool2d_avg(y) 
- y = fluid.layers.reshape(y, shape=[-1, 1024]) + y = paddle.reshape(y, shape=[-1, 1024]) y = self.out(y) return y @@ -438,7 +438,7 @@ def forward(self, inputs): y = inv(y) y = self._conv9(y, if_act=True) y = self._pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) + y = paddle.reshape(y, shape=[-1, self._out_c]) y = self._fc(y) return y diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 6a2e77bdaa84c..692bae0218f03 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -186,7 +186,7 @@ def __init__(self): @declarative def forward(self, x): - x = fluid.layers.reshape(x, shape=[-1, 6]) + x = paddle.reshape(x, shape=[-1, 6]) x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3) return x1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 6b4da8aa1b536..e87c727f7d716 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -115,16 +115,16 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): ) res.append(step_input) real_res = fluid.layers.concat(res, 1) - real_res = fluid.layers.reshape( + real_res = paddle.reshape( real_res, [-1, self._num_steps, self._hidden_size] ) last_hidden = fluid.layers.concat(hidden_array, 1) - last_hidden = fluid.layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(cell_array, 1) - last_cell = fluid.layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) @@ -189,17 +189,17 @@ def build_once(self, input, label, init_hidden, init_cell): @declarative def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( + init_h = paddle.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size] ) - init_c = fluid.layers.reshape( + init_c = paddle.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size] ) x_emb = self.embedding(input) - x_emb = fluid.layers.reshape( + x_emb = paddle.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size] ) if self.dropout is not None and self.dropout > 0.0: @@ -218,7 +218,7 @@ def forward(self, input, label, init_hidden, init_cell): loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py index c111e5c4820d0..76aae1fd0e97a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -41,7 +41,7 @@ def __init__(self): @declarative def forward(self, x): - x = 
fluid.layers.reshape(x, shape=[1, 4]) + x = paddle.reshape(x, shape=[1, 4]) x = self.affine1(x) x = fluid.layers.dropout(x, self.dropout_ratio) x = fluid.layers.relu(x) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 594a3fa71f894..dfb371f414e2c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -211,7 +211,7 @@ def forward(self, inputs): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) pred = self.out(y) return pred diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index ea87ba5ba68fd..c5a25fca8d520 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -148,7 +148,7 @@ def __init__(self, num_channels, reduction_ratio): def forward(self, input): y = self._pool(input) - y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = paddle.reshape(y, shape=[-1, self._num_channels]) y = self._fc(y) y = self._excitation(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) @@ -344,7 +344,7 @@ def forward(self, inputs, label): y = self.pool2d_avg(y) y = fluid.layers.dropout(y, dropout_prob=0.5, seed=100) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) out = self.out(y) softmax_out = fluid.layers.softmax(out) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 042ba310619af..3e4d1ddf8d1ef 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -53,7 +53,7 @@ def __init__( def forward(self, inputs): x = paddle.tanh(self._conv2d(inputs)) x = fluid.layers.reduce_max(x, dim=-1) - x = fluid.layers.reshape(x, shape=[self.batch_size, -1]) + x = paddle.reshape(x, shape=[self.batch_size, -1]) return x @@ -92,12 +92,12 @@ def __init__(self, dict_dim, batch_size, seq_len): @declarative def forward(self, inputs, label=None): emb = self.embedding(inputs) - o_np_mask = ( - fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim - ).astype(dtype='float32') + o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( + dtype='float32' + ) mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) emb = emb * mask_emb - emb = fluid.layers.reshape( + emb = paddle.reshape( emb, shape=[-1, self.channels, self.seq_len, self.hid_dim] ) conv_3 = self._simple_conv_pool_1(emb) @@ -138,12 +138,12 @@ def __init__(self, dict_dim, batch_size, seq_len): @declarative def forward(self, inputs, label=None): emb = self.embedding(inputs) - o_np_mask = ( - fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim - ).astype(dtype='float32') + o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( + dtype='float32' + ) mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) emb = emb * mask_emb - emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim]) + emb = paddle.reshape(emb, 
shape=[-1, self.seq_len, self.hid_dim]) bow_1 = fluid.layers.reduce_sum(emb, dim=1) bow_1 = paddle.tanh(bow_1) fc_1 = self._fc1(bow_1) @@ -186,14 +186,12 @@ def __init__(self, dict_dim, batch_size, seq_len): @declarative def forward(self, inputs, label=None): emb = self.embedding(inputs) - o_np_mask = ( - fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim - ).astype('float32') + o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( + 'float32' + ) mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) emb = emb * mask_emb - emb = fluid.layers.reshape( - emb, shape=[self.batch_size, -1, self.hid_dim] - ) + emb = paddle.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) gru_hidden = self._gru(fc_1) gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) @@ -242,14 +240,12 @@ def __init__(self, dict_dim, batch_size, seq_len): @declarative def forward(self, inputs, label=None): emb = self.embedding(inputs) - o_np_mask = ( - fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim - ).astype('float32') + o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( + 'float32' + ) mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) emb = emb * mask_emb - emb = fluid.layers.reshape( - emb, shape=[self.batch_size, -1, self.hid_dim] - ) + emb = paddle.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) gru_forward = self._gru_forward(fc_1) gru_backward = self._gru_backward(fc_1) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py index 3e70147d30296..9762242385e75 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py @@ -145,9 +145,9 @@ def train(conf_dict, to_static): ) for left, pos_right, neg_right in train_loader(): - left = fluid.layers.reshape(left, shape=[-1, 1]) - pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1]) - neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1]) + left = paddle.reshape(left, shape=[-1, 1]) + pos_right = paddle.reshape(pos_right, shape=[-1, 1]) + neg_right = paddle.reshape(neg_right, shape=[-1, 1]) net.train() global_step += 1 left_feat, pos_score = net(left, pos_right) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 5d06d1c694ec6..2087e615a576b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -22,7 +22,7 @@ def dyfunc_tensor_shape_1(x): x = fluid.dygraph.to_variable(x) - res = fluid.layers.reshape(x, shape=x.shape) + res = paddle.reshape(x, shape=x.shape) return res @@ -38,13 +38,13 @@ def dyfunc_tensor_shape_3(x): # Transform y.shape but run y.shape actually because y is not Tensor x = fluid.dygraph.to_variable(x) y = np.ones(5) - res = fluid.layers.reshape(x, shape=y.shape) + res = paddle.reshape(x, shape=y.shape) return res def dyfunc_tensor_shape_4(x): x = fluid.dygraph.to_variable(x) - res = fluid.layers.reshape(x, shape=(-1, x.shape[0], len(x.shape))) + res = paddle.reshape(x, shape=(-1, x.shape[0], len(x.shape))) return res @@ -54,7 +54,7 @@ def dyfunc_tensor_shape_5(x): # paddle.jit.dy2static.convert_var_shape(x)[0]))` x = fluid.dygraph.to_variable(x) s = x.shape[0] - res = fluid.layers.reshape(x, shape=(-1, 
s)) + res = paddle.reshape(x, shape=(-1, s)) return res @@ -64,7 +64,7 @@ def dyfunc_tensor_shape_6(x): # paddle.jit.dy2static.convert_var_shape(x)[0:]))` x = fluid.dygraph.to_variable(x) s = x.shape[0:] - res = fluid.layers.reshape(x, shape=s) + res = paddle.reshape(x, shape=s) return res @@ -103,7 +103,7 @@ def dyfunc_paddle_shape_api(x): def dyfunc_with_if_1(x): x = fluid.dygraph.to_variable(x) - res = fluid.layers.reshape(x, [-1, 1]) + res = paddle.reshape(x, [-1, 1]) x_shape_0 = x.shape[0] if x_shape_0 < 1: # `res.shape[0]` is transformed into diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index cc307e5a7bb16..dde28dadfd4e7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -200,16 +200,16 @@ def __init__(self, name_scope, config, mode): @declarative def forward(self, inputs): - y = fluid.layers.reshape(inputs, [-1] + self.reshape_list) + y = paddle.reshape(inputs, [-1] + self.reshape_list) y = self.conv(y) y = self.pool2d_max(y) for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) y = fluid.layers.dropout(y, dropout_prob=0.5) - y = fluid.layers.reshape(y, [-1, self.seg_num, y.shape[1]]) + y = paddle.reshape(y, [-1, self.seg_num, y.shape[1]]) y = fluid.layers.reduce_mean(y, dim=1) - y = fluid.layers.reshape(y, shape=[-1, 2048]) + y = paddle.reshape(y, shape=[-1, 2048]) y = self.out(y) return y diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index a2c6b4c225dcd..e26699bacfb52 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -138,11 +138,11 @@ def forward(self, queries, keys, values, attn_bias, cache=None): k = self.k_fc(keys) v = self.v_fc(values) # split head - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) + k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) k = layers.transpose(x=k, perm=[0, 2, 1, 3]) - v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) + v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) v = layers.transpose(x=v, perm=[0, 2, 1, 3]) if cache is not None: @@ -161,7 +161,7 @@ def forward(self, queries, keys, values, attn_bias, cache=None): weights = layers.dropout(weights, dropout_prob=self.dropout_rate) out = layers.matmul(weights, v) out = layers.transpose(out, perm=[0, 2, 1, 3]) - out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) out = self.proj_fc(out) return out @@ -557,7 +557,7 @@ def forward( dec_output = self.decoder( dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias, caches ) - dec_output = layers.reshape( + dec_output = paddle.reshape( dec_output, shape=[-1, dec_output.shape[-1]], ) @@ -694,7 +694,7 @@ def beam_search( max_len=256, ): def expand_to_beam_size(tensor, beam_size): - tensor = layers.reshape( + tensor = paddle.reshape( tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]) ) tile_dims = [1] * 
len(tensor.shape) @@ -709,7 +709,7 @@ def merge_batch_beams(tensor): + list(range(0, var_dim_in_state)), ) - tensor = layers.reshape( + tensor = paddle.reshape( tensor, [0] * (len(tensor.shape) - var_dim_in_state) + [batch_size * beam_size], @@ -733,7 +733,7 @@ def split_batch_beams(tensor): list(range(var_dim_in_state, len(tensor.shape))) + list(range(0, var_dim_in_state)), ) - tensor = layers.reshape( + tensor = paddle.reshape( tensor, [0] * (len(tensor.shape) - var_dim_in_state) + [batch_size, beam_size], @@ -849,7 +849,7 @@ def gather(input, indices, batch_pos): log_probs = layers.elementwise_add( x=step_log_probs, y=log_probs, axis=0 ) - log_probs = layers.reshape( + log_probs = paddle.reshape( log_probs, [-1, beam_size * self.trg_vocab_size] ) scores = log_probs @@ -868,7 +868,7 @@ def gather(input, indices, batch_pos): finished = layers.logical_or( finished, layers.equal(token_indices, end_token_tensor) ) - trg_word = layers.reshape(token_indices, [-1, 1]) + trg_word = paddle.reshape(token_indices, [-1, 1]) predict_ids.append(token_indices) parent_ids.append(beam_indices) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index c6016d2ec13db..47ed9bb7d4cb2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -50,7 +50,7 @@ def build_model(self): name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) add = paddle.fluid.layers.elementwise_add(x, x) - out = paddle.fluid.layers.reshape(add, **self.attrs) + out = paddle.reshape(add, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 74bae31111ba1..1363faac99c3b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -47,7 +47,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) - out = paddle.fluid.layers.reshape(x=x, **self.attrs) + out = paddle.reshape(x=x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py index e42e5b4d54e0b..0445dd808eac1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py @@ -59,7 +59,7 @@ def _test_base(self, run_ipu=True): dtype=self.feed_dtype[0], ) add1 = paddle.fluid.layers.elementwise_add(x, x) - reshape = paddle.fluid.layers.reshape(add1, **self.attrs) + reshape = paddle.reshape(add1, **self.attrs) add2 = paddle.fluid.layers.elementwise_add(reshape, reshape) scale1 = paddle.fluid.layers.scale(add2) scale2 = paddle.fluid.layers.scale(scale1, scale=1.3, bias=0.5) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py index 2cd9cbcb05700..5792db6af95c0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py @@ -17,6 +17,7 @@ from 
inference_pass_test import InferencePassTest import paddle.fluid as fluid from paddle.fluid.core import PassVersionChecker +import paddle class TestMKLDNNCpuBfloat16Pass(InferencePassTest): @@ -27,7 +28,7 @@ def setUp(self): name='x', shape=[-1] + self.shape_x, dtype=self.d_type ) out = fluid.layers.transpose(x, perm=[0, 1, 2, 3]) - out = fluid.layers.reshape(out, [0, 0, 0, 0]) + out = paddle.reshape(out, [0, 0, 0, 0]) out = fluid.layers.fc(out, size=1) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py index 1991f3592fc6e..a320dfbe4dcff 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py @@ -14,7 +14,7 @@ import unittest import numpy as np - +import paddle import paddle.fluid as fluid from inference_pass_test import InferencePassTest @@ -37,9 +37,7 @@ def make_network(self): ) out = fluid.layers.matmul(x, y) out = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) - out = fluid.layers.reshape( - out, [0, 0, self.shape_y[0] * self.shape_y[2]] - ) + out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) out = fluid.layers.relu(out) return out @@ -80,7 +78,7 @@ def make_network(self): ) out = fluid.layers.matmul(x, y) out = fluid.layers.transpose(out, perm=[0, 1, 2, 3]) - out = fluid.layers.reshape(out, [0, 0, 0, 0]) + out = paddle.reshape(out, [0, 0, 0, 0]) out = fluid.layers.fc(out, size=1) return out @@ -106,9 +104,7 @@ def make_network(self): out = fluid.layers.transpose( out, perm=[0, 1, 2, 3] ) # breaks pattern - out = fluid.layers.reshape( - out, [0, 0, self.shape_y[0] * self.shape_y[2]] - ) + out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) out = fluid.layers.relu(out) return out diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py index b62b3eaf51795..188d111c45528 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py @@ -33,7 +33,7 @@ def setUp(self): weight = fluid.layers.create_parameter( shape=self.weight_shape, dtype="float32" ) - reshape = fluid.layers.reshape(data, shape=self.reshape_shape) + reshape = paddle.reshape(data, shape=self.reshape_shape) transpose = fluid.layers.transpose(reshape, self.tranpose_perm) matmul = paddle.matmul( transpose, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py index f19de2a3bb372..8ca6bbad042dd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py @@ -30,9 +30,9 @@ def network(): self.data = fluid.data( name='data', shape=[1, 28, 28], dtype='float32' ) - data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14]) + data_reshape = paddle.reshape(self.data, shape=[1, 4, 14, 14]) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 
1]) + label_shape = paddle.reshape(self.label, shape=[1, 1, 1]) conv_out = fluid.layers.conv2d( input=data_reshape, num_filters=self.conv_num_filters, @@ -44,13 +44,13 @@ def network(): act=None, ) if self.conv_padding == [1, 1]: - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) + cout = paddle.reshape(conv_out, shape=[1, 1, 10816]) elif self.conv_padding == 'VALID': - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 7744]) + cout = paddle.reshape(conv_out, shape=[1, 1, 7744]) elif self.conv_padding == 'SAME': - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 12544]) + cout = paddle.reshape(conv_out, shape=[1, 1, 12544]) elif self.conv_groups == 4: - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) + cout = paddle.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) avg_loss = paddle.mean(loss) @@ -140,9 +140,9 @@ def network(): self.data = fluid.data( name='data', shape=[1, 28, 28], dtype='float32' ) - data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14]) + data_reshape = paddle.reshape(self.data, shape=[1, 4, 14, 14]) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1]) + label_shape = paddle.reshape(self.label, shape=[1, 1, 1]) conv_out = fluid.layers.conv2d( input=data_reshape, num_filters=self.conv_num_filters, @@ -153,7 +153,7 @@ def network(): use_cudnn=self.use_cudnn, act=None, ) - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) + cout = paddle.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) avg_loss = paddle.mean(loss) @@ -234,9 +234,9 @@ def network(): self.data = fluid.data( name='data', shape=[1, 28, 28], dtype='float32' ) - data_reshape = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14]) + data_reshape = paddle.reshape(self.data, shape=[1, 4, 14, 14]) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1]) + label_shape = paddle.reshape(self.label, shape=[1, 1, 1]) conv_out = fluid.layers.conv2d_transpose( input=data_reshape, num_filters=self.conv_num_filters, @@ -248,13 +248,13 @@ def network(): act=None, ) if self.conv_padding == [1, 1]: - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 14400]) + cout = paddle.reshape(conv_out, shape=[1, 1, 14400]) elif self.conv_padding == 'VALID': - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 18496]) + cout = paddle.reshape(conv_out, shape=[1, 1, 18496]) elif self.conv_padding == 'SAME': - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 12544]) + cout = paddle.reshape(conv_out, shape=[1, 1, 12544]) elif self.conv_groups == 4: - cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816]) + cout = paddle.reshape(conv_out, shape=[1, 1, 10816]) result = fluid.layers.relu(cout) loss = fluid.layers.cross_entropy(input=result, label=label_shape) avg_loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py index 179dcd140d8a2..908b7c2ad318c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py @@ -102,7 +102,7 @@ def network(): bias_attr=False, act=None, ) - 
c_out = fluid.layers.reshape(fc_out, shape=[0, 784]) + c_out = paddle.reshape(fc_out, shape=[0, 784]) result = fluid.layers.relu(c_out) loss = fluid.layers.cross_entropy(input=result, label=self.label) avg_loss = paddle.mean(loss) @@ -162,8 +162,8 @@ def network(): name='data', shape=[1, 28, 28], dtype='float32' ) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - label_shape = fluid.layers.reshape(self.label, shape=[1, 1, 1]) - reshape_out = fluid.layers.reshape(self.data, shape=[1, 14, 14, 4]) + label_shape = paddle.reshape(self.label, shape=[1, 1, 1]) + reshape_out = paddle.reshape(self.data, shape=[1, 14, 14, 4]) fc_out = fluid.layers.fc( input=reshape_out, size=14, @@ -171,7 +171,7 @@ def network(): bias_attr=False, act=None, ) - c_out = fluid.layers.reshape(fc_out, shape=[1, 1, 2744]) + c_out = paddle.reshape(fc_out, shape=[1, 1, 2744]) result = fluid.layers.relu(c_out) loss = fluid.layers.cross_entropy(input=result, label=label_shape) avg_loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index 75db2ebb221f5..f530a2bb12e9b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -126,7 +126,7 @@ def network(): name='data', shape=[1, 28, 28], dtype='float32' ) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') - reshape_out = fluid.layers.reshape(self.data, shape=[1, 4, 14, 14]) + reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14]) matmul_out = fluid.layers.matmul( x=reshape_out, y=reshape_out, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py index 05df3b65082b7..5d7f12e554e8a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms3_op.py @@ -22,6 +22,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle def multiclass_nms( @@ -235,7 +236,7 @@ def build(self): nms_eta=self.nms_eta, ) mutliclass_nms_out = multiclass_nms_out + 1.0 - multiclass_nms_out = fluid.layers.reshape( + multiclass_nms_out = paddle.reshape( multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6], name='reshape', diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py index ead11ba7ae170..bc432a69a212c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -20,6 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class TensorRTMultiClassNMSTest(InferencePassTest): @@ -62,7 +63,7 @@ def build(self): normalized=self.normalized, ) mutliclass_nms_out = multiclass_nms_out + 1.0 - multiclass_nms_out = fluid.layers.reshape( + multiclass_nms_out = paddle.reshape( multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6], name='reshape', diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index 075919b7bf6e3..8972067760260 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class TRTReshapeTest(InferencePassTest): @@ -48,7 +49,7 @@ def setUp(self): self.fetch_list = [out] def append_reshape(self, data, reshape): - return fluid.layers.reshape(data, reshape) + return paddle.reshape(data, reshape) def test_check_output(self): if core.is_compiled_with_cuda(): @@ -101,7 +102,7 @@ def setUp(self): data = fluid.data( name='data', shape=self.data_shape, dtype='float32' ) - reshape_out = fluid.layers.reshape(x=data, shape=self.reshape) + reshape_out = paddle.reshape(x=data, shape=self.reshape) out = fluid.layers.batch_norm(reshape_out, is_test=True) self.feeds = { 'data': np.random.random(self.data_shape).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py index d5bbbcde1e163..754149f7b3489 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class ShuffleChannelFuseTRTPassTest(InferencePassTest): @@ -26,9 +27,9 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 6, 64, 64], dtype="float32" ) - reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64]) + reshape1 = paddle.reshape(x=data, shape=[-1, 2, 3, 64, 64]) trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) - reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64]) + reshape2 = paddle.reshape(x=trans, shape=[-1, 6, 64, 64]) out = fluid.layers.batch_norm(reshape2, is_test=True) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index 16621212e88f7..b91b068adb828 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -21,6 +21,7 @@ import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig +import paddle class TensorRTSubgraphPassFcTest(InferencePassTest): @@ -30,7 +31,7 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) fc_out = fluid.layers.fc(input=[data], act=None, size=1000) - reshape_out = fluid.layers.reshape(x=fc_out, shape=[1, 1000]) + reshape_out = paddle.reshape(x=fc_out, shape=[1, 1000]) self.feeds = { "data": np.random.random([1, 6, 64, 64]).astype("float32"), } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py index 409d36600d28b..8fc8b464dda1e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig +import paddle class TransposeFlattenConcatFusePassTRTTest(InferencePassTest): @@ -36,7 +37,7 @@ def setUp(self): concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) # There is no parameters for above structure. # Hence, append a batch_norm to avoid failure caused by load_combined. - reshape_out = fluid.layers.reshape(concat_out, [-1, 0, 1, 1]) + reshape_out = paddle.reshape(concat_out, [-1, 0, 1, 1]) out = fluid.layers.batch_norm(reshape_out, is_test=True) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index e63adefa3d162..f7ccdc3bea0b1 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -100,7 +100,7 @@ def __init__(self): def forward(self, inputs, label): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) - x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = paddle.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._fc(x) loss = fluid.layers.cross_entropy(cost, label) avg_loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index af6471611e969..1163a99552aa5 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -68,11 +68,11 @@ def forward(self, input, label): x_emb = self.embedding(input) fc = fluid.layers.matmul(x_emb, self.softmax_weight) fc = fluid.layers.elementwise_add(fc, self.softmax_bias) - projection = fluid.layers.reshape(fc, shape=[-1, self.vocab_size]) + projection = paddle.reshape(fc, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index b2bc25e35ae3e..32dbeb71b9756 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -48,9 +48,7 @@ def squeeze_excitation(input, num_channels, reduction_ratio): # input=input, pool_size=0, pool_type='avg', global_pooling=True) conv = input shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]] - ) + reshape = paddle.reshape(x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) pool = fluid.layers.reduce_mean(input=reshape, dim=2) squeeze = fluid.layers.fc( @@ -161,9 +159,7 @@ def SE_ResNeXt50Small(use_feed): ) shape = conv.shape - reshape = fluid.layers.reshape( - x=conv, shape=[-1, shape[1], shape[2] * shape[3]] - ) + reshape = paddle.reshape(x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) pool = fluid.layers.reduce_mean(input=reshape, dim=2) dropout = ( pool diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py index 
7d99fad642292..60989a0b60880 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py @@ -18,6 +18,7 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid.framework import Program, program_guard +import paddle def create_tensor(scope, name, np_data): @@ -312,7 +313,7 @@ def test_errors(self): topk_scores, topk_indices = fluid.layers.topk(probs, k=4) accu_scores = fluid.layers.elementwise_add( x=fluid.layers.log(x=topk_scores), - y=fluid.layers.reshape(pre_scores, shape=[-1]), + y=paddle.reshape(pre_scores, shape=[-1]), axis=0, ) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index 87bf029956442..df912af7feddc 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -84,7 +84,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -116,7 +116,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -147,7 +147,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py index d70ed0b9031ba..6be2ad229a359 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py @@ -80,7 +80,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -108,7 +108,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -135,7 +135,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py index d8506f64a4af2..8718931752e25 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py @@ -83,7 +83,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = 
fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -111,7 +111,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -138,7 +138,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py index 0d531054a709b..0b6e1c9c48316 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -86,7 +86,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -116,7 +116,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -145,7 +145,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index 8d0fdd6f9c0cb..e95a42a44a12d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -85,7 +85,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -116,7 +116,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -145,7 +145,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index 80830b96936dc..33af7401ae0ac 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -84,7 +84,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - q_emb = 
fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -116,7 +116,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -147,7 +147,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index 61561621d3839..ce828ff213543 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -82,7 +82,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -112,7 +112,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index 8729c4d63971c..692c84ac518ff 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -84,7 +84,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -116,7 +116,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=is_sparse, ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -147,7 +147,7 @@ def get_loss(cos_q_pt, cos_q_nt): ), is_sparse=False, ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py index 83c710c4eaef7..3dbef1bd2adeb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py @@ -82,7 +82,7 @@ def 
get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + q_emb = paddle.reshape(q_emb, [-1, emb_dim]) # vsum q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') q_ss = paddle.nn.functional.softsign(q_sum) @@ -112,7 +112,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim]) + pt_emb = paddle.reshape(pt_emb, [-1, emb_dim]) # vsum pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') pt_ss = paddle.nn.functional.softsign(pt_sum) @@ -141,7 +141,7 @@ def get_loss(cos_q_pt, cos_q_nt): learning_rate=emb_lr, ), ) - nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim]) + nt_emb = paddle.reshape(nt_emb, [-1, emb_dim]) # vsum nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum') nt_ss = paddle.nn.functional.softsign(nt_sum) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 7b5fe2c11434b..ce02bc4af7950 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -381,7 +381,7 @@ def net_conf(self): ), ) - neg_word_reshape = fluid.layers.reshape(inputs[2], shape=[-1, 1]) + neg_word_reshape = paddle.reshape(inputs[2], shape=[-1, 1]) neg_word_reshape.stop_gradient = True neg_emb_w = fluid.layers.embedding( @@ -391,7 +391,7 @@ def net_conf(self): param_attr=fluid.ParamAttr(name='emb_w', learning_rate=1.0), ) - neg_emb_w_re = fluid.layers.reshape( + neg_emb_w_re = paddle.reshape( neg_emb_w, shape=[-1, neg_num, embedding_size] ) @@ -402,7 +402,7 @@ def net_conf(self): param_attr=fluid.ParamAttr(name='emb_b', learning_rate=1.0), ) - neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num]) + neg_emb_b_vec = paddle.reshape(neg_emb_b, shape=[-1, neg_num]) true_logits = fluid.layers.elementwise_add( fluid.layers.reduce_sum( @@ -413,14 +413,12 @@ def net_conf(self): true_emb_b, ) - input_emb_re = fluid.layers.reshape( - input_emb, shape=[-1, 1, embedding_size] - ) + input_emb_re = paddle.reshape(input_emb, shape=[-1, 1, embedding_size]) neg_matmul = fluid.layers.matmul( input_emb_re, neg_emb_w_re, transpose_y=True ) - neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num]) + neg_matmul_re = paddle.reshape(neg_matmul, shape=[-1, neg_num]) neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec) # nce loss label_ones = fluid.layers.fill_constant_batch_size_like( diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py index 7e7fab954bfd6..f96595588c3db 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py @@ -115,7 +115,7 @@ def __init__(self, dtype="float32"): def forward(self, inputs, label): x = paddle.nn.functional.relu(self._simple_img_conv_pool_1(inputs)) x = paddle.nn.functional.relu(self._simple_img_conv_pool_2(x)) - x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = paddle.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._linear(x) loss = fluid.layers.cross_entropy(cost, label) avg_loss = paddle.mean(loss) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index 02ce69de22018..08bb8fceb4fa4 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -104,7 +104,7 @@ def __init__(self): def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) - x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = paddle.reshape(x, shape=[-1, self.pool_2_shape]) x = self._fc(x) return x diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index e1271f30c4f35..220a6d13b81fa 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -149,8 +149,8 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): pre_cell = layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) - pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) + pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size]) + pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) @@ -270,12 +270,8 @@ def encoder_static( pre_cell = layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = layers.reshape( - pre_hidden, shape=[-1, hidden_size], inplace=True - ) - pre_cell = layers.reshape( - pre_cell, shape=[-1, hidden_size], inplace=True - ) + pre_hidden = paddle.reshape(pre_hidden, shape=[-1, hidden_size]) + pre_cell = paddle.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) @@ -286,7 +282,7 @@ def encoder_static( for index in range(len): input = sliced_inputs[index] - input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) + input = paddle.reshape(input, shape=[-1, hidden_size]) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] @@ -318,21 +314,19 @@ def encoder_static( res.append(input) last_hidden = layers.concat(hidden_array, 1) - last_hidden = layers.reshape( - last_hidden, shape=[-1, num_layers, hidden_size], inplace=True + last_hidden = paddle.reshape( + last_hidden, shape=[-1, num_layers, hidden_size] ) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) - last_cell = layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, num_layers, hidden_size] ) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) - real_res = layers.reshape( - real_res, shape=[len, -1, hidden_size], inplace=True - ) + real_res = paddle.reshape(real_res, shape=[len, -1, hidden_size]) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell @@ -367,10 +361,10 @@ def encoder_static( init_cell.persistable = True init_hidden.persistable = True - init_hidden_reshape = layers.reshape( + init_hidden_reshape = paddle.reshape( init_hidden, shape=[num_layers, -1, hidden_size] ) - init_cell_reshape = layers.reshape( + init_cell_reshape = paddle.reshape( init_cell, shape=[num_layers, -1, hidden_size] ) @@ -387,9 +381,7 @@ def encoder_static( ), ) - x_emb = layers.reshape( - x_emb, shape=[-1, num_steps, hidden_size], inplace=True - ) + x_emb = paddle.reshape(x_emb, shape=[-1, num_steps, hidden_size]) if dropout is not None and dropout > 0.0: x_emb = layers.dropout( x_emb, @@ -447,9 +439,7 @@ def 
encoder_static( print("type not support") return - rnn_out = layers.reshape( - rnn_out, shape=[-1, num_steps, hidden_size], inplace=True - ) + rnn_out = paddle.reshape(rnn_out, shape=[-1, num_steps, hidden_size]) softmax_weight = layers.create_parameter( [hidden_size, vocab_size], @@ -470,15 +460,13 @@ def encoder_static( projection = layers.matmul(rnn_out, softmax_weight) projection = layers.elementwise_add(projection, softmax_bias) - projection = layers.reshape( - projection, shape=[-1, vocab_size], inplace=True - ) + projection = paddle.reshape(projection, shape=[-1, vocab_size]) loss = layers.softmax_with_cross_entropy( logits=projection, label=y, soft_label=False ) - loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) + loss = paddle.reshape(loss, shape=[-1, num_steps]) loss = layers.reduce_mean(loss, dim=[0]) loss = layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 1793d69f48f18..37ee4897e77f2 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -107,8 +107,8 @@ def run_main(self, place, with_data_parallel): with while_op.block(): d = layers.array_read(array=data_array, i=i) prev = layers.array_read(array=mem_array, i=i) - d = layers.reshape(d, shape=[10]) - prev = layers.reshape(prev, shape=[10]) + d = paddle.reshape(d, shape=[10]) + prev = paddle.reshape(prev, shape=[10]) result = layers.sums(input=[d, prev]) i = layers.increment(x=i, in_place=True) @@ -117,8 +117,8 @@ def run_main(self, place, with_data_parallel): with while_op2.block(): d2 = layers.array_read(array=data_array, i=j) prev2 = layers.array_read(array=mem_array, i=j) - d2 = layers.reshape(d2, shape=[10]) - prev2 = layers.reshape(prev2, shape=[10]) + d2 = paddle.reshape(d2, shape=[10]) + prev2 = paddle.reshape(prev2, shape=[10]) result2 = layers.sums(input=[d2, prev2]) j = layers.increment(x=j, in_place=True) diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index 68f9696ffe226..be5fdcba69557 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -52,7 +52,7 @@ def run_program(self, place, stop_gradient=False): x = fluid.layers.concat([x_1, x_2], axis=-1) for _ in range(self.reshape_times): - x = fluid.layers.reshape(x, [-1, 1]) + x = paddle.reshape(x, [-1, 1]) x.stop_gradient = stop_gradient diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index 686a5c1e41088..54eacb5ec0d8f 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -54,7 +54,7 @@ def simple_depthwise_net(use_feed): assert use_feed img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = fluid.layers.reshape(img, (-1, 1, 28, 28)) + hidden = paddle.reshape(img, (-1, 1, 28, 28)) for _ in range(4): hidden = sep_conv(hidden, channel=200, stride=2, filter=5) hidden = fluid.layers.relu(hidden) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py 
b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index eaa8474c8246f..197d68db745c7 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -140,7 +140,7 @@ def forward(self, inputs): input = fluid.layers.slice( inputs, axes=[1], starts=[i], ends=[i + 1] ) - input = fluid.layers.reshape(input, shape=[1, 3]) + input = paddle.reshape(input, shape=[1, 3]) out_softmax, pre_hidden = self._cell(input, pre_hidden) outs.append(out_softmax) @@ -739,15 +739,11 @@ def func_dygraph_vs_static(self): ) a = fluid.layers.expand( - fluid.layers.reshape( - fluid.layers.reduce_sum(inp_data1), [1, 1] - ), + paddle.reshape(fluid.layers.reduce_sum(inp_data1), [1, 1]), [4, 1], ) b = fluid.layers.expand( - fluid.layers.reshape( - fluid.layers.reduce_sum(inp_data2), [1, 1] - ), + paddle.reshape(fluid.layers.reduce_sum(inp_data2), [1, 1]), [4, 1], ) cond = fluid.layers.less_than(x=a, y=b) @@ -796,7 +792,7 @@ def func_rnn(self): np_inp = np_inp.astype(np.float32) with fluid.dygraph.guard(): var_inp = paddle.to_tensor(np_inp) - var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) + var_inp = paddle.reshape(var_inp, shape=[1, 4, 3]) simple_rnn = SimpleRNN() outs, pre_hiddens = simple_rnn.forward(var_inp) dy_out = outs[3].numpy() @@ -807,7 +803,7 @@ def func_rnn(self): with fluid.dygraph.guard(): var_inp2 = paddle.to_tensor(np_inp) - var_inp2 = fluid.layers.reshape(var_inp2, shape=[1, 4, 3]) + var_inp2 = paddle.reshape(var_inp2, shape=[1, 4, 3]) simple_rnn2 = SimpleRNN() outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2) dy_out2 = outs2[3].numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index a824774bbb132..8703bafb260a8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -92,7 +92,7 @@ def func_gnn_float32(self): model = GCN('test_gcn', 50) logits = model(features, adj) - logits = fluid.layers.reshape(logits, logits.shape[1:]) + logits = paddle.reshape(logits, logits.shape[1:]) # In other example, it's nll with log_softmax. However, paddle's # log_loss only supports binary classification now. loss = fluid.layers.softmax_with_cross_entropy(logits, labels) @@ -130,7 +130,7 @@ def func_gnn_float32(self): model = GCN('test_gcn', 50) logits = model(to_variable(features), to_variable(adj)) - logits = fluid.layers.reshape(logits, logits.shape[1:]) + logits = paddle.reshape(logits, logits.shape[1:]) # In other example, it's nll with log_softmax. However, paddle's # log_loss only supports binary classification now. loss = fluid.layers.softmax_with_cross_entropy( @@ -158,7 +158,7 @@ def func_gnn_float32(self): model2 = GCN('test_gcn', 50) logits2 = model2(to_variable(features2), to_variable(adj2)) - logits2 = fluid.layers.reshape(logits2, logits2.shape[1:]) + logits2 = paddle.reshape(logits2, logits2.shape[1:]) # In other example, it's nll with log_softmax. However, paddle's # log_loss only supports binary classification now. 
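# A minimal standalone sketch (assuming Paddle 2.x dygraph mode; the tensor names
# below are hypothetical, not taken from the tests) of the paddle.reshape calls the
# updated dygraph tests rely on: 0 copies the corresponding input dimension and -1
# infers a single dimension, matching the legacy fluid.layers.reshape semantics.
import paddle

x = paddle.rand([1, 100, 50])          # e.g. logits with a leading batch dim of 1
y = paddle.reshape(x, x.shape[1:])     # drop the leading dim -> [100, 50]
z = paddle.reshape(x, [0, -1])         # keep dim 0, flatten the rest -> [1, 5000]
print(y.shape, z.shape)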
loss2 = fluid.layers.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index ba8e239e3af6a..73f8973ebaf1b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -66,13 +66,11 @@ def forward(self, input, label): x_emb, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]) ) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index 5e662331b6611..67ad27a1ba8d4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -103,7 +103,7 @@ def __init__(self): def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) - x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = paddle.reshape(x, shape=[-1, self.pool_2_shape]) x = self._fc(x) return x diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 08a32aeaa9971..46b568dec44bd 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -195,13 +195,9 @@ def forward(self, inputs): input_ = fluid.layers.slice( inputs, axes=[1], starts=[i], ends=[i + 1] ) - input_ = fluid.layers.reshape( - input_, [-1, input_.shape[2]], inplace=False - ) + input_ = paddle.reshape(input_, [-1, input_.shape[2]]) hidden, reset, gate = self.gru_unit(input_, hidden) - hidden_ = fluid.layers.reshape( - hidden, [-1, 1, hidden.shape[1]], inplace=False - ) + hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]]) if self.is_reverse: res = [hidden_] + res else: @@ -271,7 +267,7 @@ def forward(self, inputs): transpose_conv_features = fluid.layers.transpose( conv_features, perm=[0, 3, 1, 2] ) - sliced_feature = fluid.layers.reshape( + sliced_feature = paddle.reshape( transpose_conv_features, [ -1, @@ -279,7 +275,6 @@ def forward(self, inputs): transpose_conv_features.shape[2] * transpose_conv_features.shape[3], ], - inplace=False, ) fc_1 = self.fc_1_layer(sliced_feature) fc_2 = self.fc_2_layer(sliced_feature) @@ -308,8 +303,8 @@ def __init__(self, decoder_size): def forward(self, encoder_vec, encoder_proj, decoder_state): decoder_state_fc = self.fc_1(decoder_state) - decoder_state_proj_reshape = fluid.layers.reshape( - decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]], inplace=False + decoder_state_proj_reshape = paddle.reshape( + decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]] ) decoder_state_expand = fluid.layers.expand( decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1] @@ -320,10 +315,9 @@ def 
forward(self, encoder_vec, encoder_proj, decoder_state): concated = paddle.tanh(x=concated) attention_weight = self.fc_2(concated) - weights_reshape = fluid.layers.reshape( + weights_reshape = paddle.reshape( x=attention_weight, shape=[attention_weight.shape[0], attention_weight.shape[1]], - inplace=False, ) weights_reshape = fluid.layers.softmax(weights_reshape) @@ -364,8 +358,8 @@ def forward( current_word = fluid.layers.slice( target_embedding, axes=[1], starts=[i], ends=[i + 1] ) - current_word = fluid.layers.reshape( - current_word, [-1, current_word.shape[2]], inplace=False + current_word = paddle.reshape( + current_word, [-1, current_word.shape[2]] ) context = self.simple_attention( @@ -407,17 +401,16 @@ def forward(self, inputs, label_in): backward_first = fluid.layers.slice( gru_backward, axes=[1], starts=[0], ends=[1] ) - backward_first = fluid.layers.reshape( - backward_first, [-1, backward_first.shape[2]], inplace=False + backward_first = paddle.reshape( + backward_first, [-1, backward_first.shape[2]] ) decoder_boot = self.fc(backward_first) - label_in = fluid.layers.reshape(label_in, [-1], inplace=False) + label_in = paddle.reshape(label_in, [-1]) trg_embedding = self.embedding(label_in) - trg_embedding = fluid.layers.reshape( + trg_embedding = paddle.reshape( trg_embedding, [-1, Config.max_length, trg_embedding.shape[1]], - inplace=False, ) prediction = self.gru_decoder_with_attention( @@ -497,11 +490,9 @@ def run_dygraph(): label_out.stop_gradient = True img = to_variable(image_np) dy_prediction = ocr_attention(img, label_in) - label_out = fluid.layers.reshape( - label_out, [-1, 1], inplace=False - ) - dy_prediction = fluid.layers.reshape( - dy_prediction, [label_out.shape[0], -1], inplace=False + label_out = paddle.reshape(label_out, [-1, 1]) + dy_prediction = paddle.reshape( + dy_prediction, [label_out.shape[0], -1] ) loss = fluid.layers.cross_entropy( input=dy_prediction, label=label_out @@ -577,7 +568,7 @@ def run_dygraph(): static_prediction = ocr_attention(images, static_label_in) - static_prediction = fluid.layers.reshape( + static_prediction = paddle.reshape( static_prediction, shape=[-1, Config.num_classes + 2] ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 070a92ec91f15..a75208d88d5ce 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -141,7 +141,7 @@ def _check_mlp(self, place=None): label = data[1] label.stop_gradient = True - img = fluid.layers.reshape(img, shape=[batch_size, -1]) + img = paddle.reshape(img, shape=[batch_size, -1]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) dy_out = avg_loss.numpy() @@ -180,7 +180,7 @@ def _check_mlp(self, place=None): name='pixel', shape=[1, 28, 28], dtype='float32' ) label = fluid.layers.data(name='label', shape=[1], dtype='int64') - img = fluid.layers.reshape(img, shape=[batch_size, 784]) + img = paddle.reshape(img, shape=[batch_size, 784]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 1f34d02eb8399..4023d3596bac2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -139,7 +139,7 @@ def _check_mlp(self, 
place=None): label.stop_gradient = True - img = fluid.layers.reshape(img, shape=[batch_size, -1]) + img = paddle.reshape(img, shape=[batch_size, -1]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) dy_out = avg_loss.numpy() @@ -189,7 +189,7 @@ def _check_mlp(self, place=None): name='pixel', shape=[1, 28, 28], dtype='float32' ) label = fluid.layers.data(name='label', shape=[1], dtype='int64') - img = fluid.layers.reshape(img, shape=[batch_size, 784]) + img = paddle.reshape(img, shape=[batch_size, 784]) cost = mlp(img) avg_loss = fluid.layers.reduce_mean(cost) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index df0a8996aca5a..a3e603b5a9618 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -85,12 +85,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): pre_cell = fluid.layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = fluid.layers.reshape( + pre_hidden = paddle.reshape( pre_hidden, shape=[-1, self._hidden_size] ) - pre_cell = fluid.layers.reshape( - pre_cell, shape=[-1, self._hidden_size] - ) + pre_cell = paddle.reshape(pre_cell, shape=[-1, self._hidden_size]) self.hidden_array.append(pre_hidden) self.cell_array.append(pre_cell) @@ -99,7 +97,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): self._input = fluid.layers.slice( input_embedding, axes=[1], starts=[index], ends=[index + 1] ) - self._input = fluid.layers.reshape( + self._input = paddle.reshape( self._input, shape=[-1, self._hidden_size] ) for k in range(self._num_layers): @@ -130,19 +128,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): dropout_implementation='upscale_in_train', ) res.append( - fluid.layers.reshape( - self._input, shape=[1, -1, self._hidden_size] - ) + paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) - last_hidden = fluid.layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) - last_cell = fluid.layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) @@ -203,16 +199,16 @@ def __init__( ) def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( + init_h = paddle.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size] ) - init_c = fluid.layers.reshape( + init_c = paddle.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size] ) x_emb = self.embedding(input) - x_emb = fluid.layers.reshape( + x_emb = paddle.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size] ) if self.dropout is not None and self.dropout > 0.0: @@ -224,18 +220,16 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out, last_hidden, last_cell = self.simple_lstm_rnn( x_emb, init_h, init_c ) - rnn_out = fluid.layers.reshape( + rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = 
fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index 91f2105f6b4cd..01f34d36c46d5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -36,7 +36,7 @@ def __init__(self, input_size): self.rewards = [] def forward(self, inputs): - x = fluid.layers.reshape(inputs, shape=[-1, 4]) + x = paddle.reshape(inputs, shape=[-1, 4]) x = self.affine1(x) x = fluid.layers.dropout(x, self.dropout_ratio) x = fluid.layers.relu(x) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 328245ab9c935..0b354868795ca 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -241,7 +241,7 @@ def forward(self, inputs): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index a0b75d716074a..4968a2fe28adc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -80,12 +80,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): pre_cell = fluid.layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = fluid.layers.reshape( + pre_hidden = paddle.reshape( pre_hidden, shape=[-1, self._hidden_size] ) - pre_cell = fluid.layers.reshape( - pre_cell, shape=[-1, self._hidden_size] - ) + pre_cell = paddle.reshape(pre_cell, shape=[-1, self._hidden_size]) self.hidden_array.append(pre_hidden) self.cell_array.append(pre_cell) @@ -94,7 +92,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): self._input = fluid.layers.slice( input_embedding, axes=[1], starts=[index], ends=[index + 1] ) - self._input = fluid.layers.reshape( + self._input = paddle.reshape( self._input, shape=[-1, self._hidden_size] ) for k in range(self._num_layers): @@ -125,19 +123,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): dropout_implementation='upscale_in_train', ) res.append( - fluid.layers.reshape( - self._input, shape=[1, -1, self._hidden_size] - ) + paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) - last_hidden = fluid.layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) last_hidden = 
fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) - last_cell = fluid.layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) @@ -198,16 +194,16 @@ def __init__( ) def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( + init_h = paddle.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size] ) - init_c = fluid.layers.reshape( + init_c = paddle.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size] ) x_emb = self.embedding(input) - x_emb = fluid.layers.reshape( + x_emb = paddle.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size] ) if self.dropout is not None and self.dropout > 0.0: @@ -219,19 +215,17 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out, last_hidden, last_cell = self.simple_lstm_rnn( x_emb, init_h, init_c ) - rnn_out = fluid.layers.reshape( + rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 1274200f31bb3..a450d7e871f55 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -82,12 +82,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): pre_cell = fluid.layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = fluid.layers.reshape( + pre_hidden = paddle.reshape( pre_hidden, shape=[-1, self._hidden_size] ) - pre_cell = fluid.layers.reshape( - pre_cell, shape=[-1, self._hidden_size] - ) + pre_cell = paddle.reshape(pre_cell, shape=[-1, self._hidden_size]) self.hidden_array.append(pre_hidden) self.cell_array.append(pre_cell) @@ -96,7 +94,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): self._input = fluid.layers.slice( input_embedding, axes=[1], starts=[index], ends=[index + 1] ) - self._input = fluid.layers.reshape( + self._input = paddle.reshape( self._input, shape=[-1, self._hidden_size] ) for k in range(self._num_layers): @@ -127,19 +125,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): dropout_implementation='upscale_in_train', ) res.append( - fluid.layers.reshape( - self._input, shape=[1, -1, self._hidden_size] - ) + paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) - last_hidden = fluid.layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = 
fluid.layers.concat(self.cell_array, 1) - last_cell = fluid.layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) @@ -200,16 +196,16 @@ def __init__( ) def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( + init_h = paddle.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size] ) - init_c = fluid.layers.reshape( + init_c = paddle.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size] ) x_emb = self.embedding(input) - x_emb = fluid.layers.reshape( + x_emb = paddle.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size] ) if self.dropout is not None and self.dropout > 0.0: @@ -221,19 +217,17 @@ def forward(self, input, label, init_hidden, init_cell): rnn_out, last_hidden, last_cell = self.simple_lstm_rnn( x_emb, init_h, init_c ) - rnn_out = fluid.layers.reshape( + rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index a0518f7ba7b43..d977dadeeba84 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -123,7 +123,7 @@ def __init__(self, num_channels, reduction_ratio): def forward(self, input): y = self._pool(input) - y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = paddle.reshape(y, shape=[-1, self._num_channels]) y = self._squeeze(y) y = self._excitation(y) y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) @@ -318,7 +318,7 @@ def forward(self, inputs): for bottleneck_block in self.bottleneck_block_list: y = bottleneck_block(y) y = self.pool2d_avg(y) - y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) y = self.out(y) return y diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index c34f0cd0e536a..f137de9dc2cb2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -75,13 +75,11 @@ def forward(self, input, label): projection = fluid.layers.matmul( fc, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]) ) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = 
fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 6a6dd3f7712a3..4a99e0fb63b18 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -309,9 +309,7 @@ def __init__(self, cfg, num_channels=3): def forward(self, input, label_trg): shape = input.shape - label_trg_e = fluid.layers.reshape( - label_trg, [-1, label_trg.shape[1], 1, 1] - ) + label_trg_e = paddle.reshape(label_trg, [-1, label_trg.shape[1], 1, 1]) label_trg_e = fluid.layers.expand( x=label_trg_e, expand_times=[1, 1, shape[2], shape[3]] ) @@ -380,9 +378,7 @@ def forward(self, input): def loss_cls(cls, label, cfg): cls_shape = cls.shape - cls = fluid.layers.reshape( - cls, [-1, cls_shape[1] * cls_shape[2] * cls_shape[3]] - ) + cls = paddle.reshape(cls, [-1, cls_shape[1] * cls_shape[2] * cls_shape[3]]) return ( fluid.layers.reduce_sum( fluid.layers.sigmoid_cross_entropy_with_logits(cls, label) @@ -432,7 +428,7 @@ def _interpolate(a, b): gradient = gradient[0] grad_shape = gradient.shape - gradient = fluid.layers.reshape( + gradient = paddle.reshape( gradient, [-1, grad_shape[1] * grad_shape[2] * grad_shape[3]] ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 1ff8ffbd8518e..5c6f224a5ee19 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -476,16 +476,16 @@ def forward(self, queries, keys, values, attn_bias): v = self._v_fc(values) # split head - reshaped_q = fluid.layers.reshape( - x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False + reshaped_q = paddle.reshape( + x=q, shape=[0, 0, self._n_head, self._d_key] ) transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) - reshaped_k = fluid.layers.reshape( - x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False + reshaped_k = paddle.reshape( + x=k, shape=[0, 0, self._n_head, self._d_key] ) transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) - reshaped_v = fluid.layers.reshape( - x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False + reshaped_v = paddle.reshape( + x=v, shape=[0, 0, self._n_head, self._d_value] ) transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) @@ -514,10 +514,9 @@ def forward(self, queries, keys, values, attn_bias): if len(out.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) - final_out = fluid.layers.reshape( + final_out = paddle.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False, ) # fc to output @@ -994,8 +993,8 @@ def forward(self, dec_inputs=None, enc_output=None): dec_input, enc_output, trg_slf_attn_bias, trg_src_attn_bias ) - dec_output_reshape = fluid.layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=False + dec_output_reshape = paddle.reshape( + dec_output, shape=[-1, dec_output.shape[-1]] ) if self._weight_sharing: diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py 
b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py index d21156b43e8cc..a15079021ed56 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -43,8 +43,8 @@ def fc_with_inplace_net(use_feed): x, y = _feed_data_helper() fc = fluid.layers.fc(input=x, size=20, act='relu') fc = fluid.layers.fc(input=fc, size=10, act='relu') - reshape = fluid.layers.reshape(x=fc, shape=[-1, 2, 5]) - reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) + reshape = paddle.reshape(x=fc, shape=[-1, 2, 5]) + reshape = paddle.reshape(x=reshape, shape=[-1, 5, 2]) y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=y_predict, label=y) avg_cost = paddle.mean(cost) diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 4b3d92c92dcea..274aa25142e40 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -148,7 +148,7 @@ def func(self, place): x = layers.data('x', x_shape, False, dtype) x.persistable = True - out = layers.reshape(x, new_shape) + out = paddle.reshape(x, new_shape) x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) gradient_checker.double_grad_check( diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index dad6e3fa3284a..4445cf3426146 100755 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -301,11 +301,6 @@ def _set_paddle_api(self): def _executed_api(self): self.reshape = paddle.reshape - def _set_fluid_api(self): - self.fill_constant = fluid.layers.fill_constant - self.data = paddle.static.data - self.reshape = fluid.layers.reshape - def _test_api(self): paddle.enable_static() input = np.random.random([2, 25]).astype("float32") @@ -317,18 +312,16 @@ def _test_api(self): actual_shape = self.data(name="shape", shape=[3], dtype="int32") - # situation 1: have shape( list, no tensor), no actual shape(Tensor) + # situation 1: have shape( list, no tensor) out_1 = self.reshape(x, shape) - # situation 2: have shape(list, no tensor), have actual shape(Tensor) - out_2 = fluid.layers.reshape( - x, shape=shape, actual_shape=actual_shape - ) + # situation 2: have shape(list, no tensor) + out_2 = paddle.reshape(x, actual_shape) - # Situation 3: have shape(list, have tensor), no actual shape(Tensor) + # Situation 3: have shape(list, have tensor) out_3 = self.reshape(x, shape=[positive_five, 10]) - # Situation 4: have shape(Tensor), no actual shape(Tensor) + # Situation 4: have shape(Tensor) out_4 = self.reshape(x, shape=actual_shape) exe = paddle.static.Executor(place=paddle.CPUPlace()) @@ -347,10 +340,6 @@ def test_paddle_api(self): self._set_paddle_api() self._test_api() - def test_fluid_api(self): - self._set_fluid_api() - self._test_api() - def test_imperative(self): self._set_paddle_api() input = np.random.random([2, 25]).astype("float32") @@ -401,10 +390,6 @@ def _set_paddle_api(self): self.data = paddle.static.data self.reshape = paddle.reshape - def _set_fluid_api(self): - self.data = fluid.data - self.reshape = fluid.layers.reshape - def _test_errors(self): with program_guard(Program(), Program()): # The x type of reshape_op must be Variable. 
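The substitution running through all of these test updates is mechanical: `fluid.layers.reshape(x, shape, actual_shape=..., inplace=...)` becomes `paddle.reshape(x, shape)`, where `shape` may be a list, a Tensor, or a list containing Tensors, and the old `actual_shape`, `inplace` and `act` arguments are gone. A minimal sketch of the surviving call, using a made-up input tensor rather than anything from the tests:

    import paddle

    x = paddle.rand([2, 25])

    # Old fluid form being removed in this series (shown for comparison only):
    #   out = fluid.layers.reshape(x, shape=[5, 10], inplace=False)
    # Paddle 2.x replacement used in the updated tests:
    out_1 = paddle.reshape(x, shape=[5, 10])

    # `shape` may also be given as a Tensor, covering the old `actual_shape` case.
    shape = paddle.to_tensor([5, 10], dtype="int32")
    out_2 = paddle.reshape(x, shape=shape)

    print(out_1.shape, out_2.shape)  # [5, 10] [5, 10]
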
@@ -439,12 +424,6 @@ def test_shape_type(): self.assertRaises(TypeError, test_shape_type) - # The argument actual_shape's type of reshape_op must be Variable or None. - def test_actual_shape_type(): - self.reshape(x3, shape=[25, 2], actual_shape=1) - - self.assertRaises(TypeError, test_actual_shape_type) - # The argument shape have more than one -1. def test_shape_1(): self.reshape(x3, shape=[-1, -1, 5]) @@ -467,10 +446,6 @@ def test_paddle_api_error(self): self._set_paddle_api() self._test_errors() - def test_fluid_api_error(self): - self._set_fluid_api() - self._test_errors() - class TestDygraphReshapeAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index 5c1e96ebb8477..8c4c8aa60de0a 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -92,12 +92,10 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): pre_cell = fluid.layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1] ) - pre_hidden = fluid.layers.reshape( + pre_hidden = paddle.reshape( pre_hidden, shape=[-1, self._hidden_size] ) - pre_cell = fluid.layers.reshape( - pre_cell, shape=[-1, self._hidden_size] - ) + pre_cell = paddle.reshape(pre_cell, shape=[-1, self._hidden_size]) self.hidden_array.append(pre_hidden) self.cell_array.append(pre_cell) @@ -106,7 +104,7 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): self._input = fluid.layers.slice( input_embedding, axes=[1], starts=[index], ends=[index + 1] ) - self._input = fluid.layers.reshape( + self._input = paddle.reshape( self._input, shape=[-1, self._hidden_size] ) for k in range(self._num_layers): @@ -137,19 +135,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): dropout_implementation='upscale_in_train', ) res.append( - fluid.layers.reshape( - self._input, shape=[1, -1, self._hidden_size] - ) + paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) - last_hidden = fluid.layers.reshape( + last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) - last_cell = fluid.layers.reshape( + last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) @@ -210,18 +206,18 @@ def __init__( ) def forward(self, input, label, init_hidden, init_cell): - init_h = fluid.layers.reshape( + init_h = paddle.reshape( init_hidden, shape=[self.num_layers, -1, self.hidden_size] ) - init_c = fluid.layers.reshape( + init_c = paddle.reshape( init_cell, shape=[self.num_layers, -1, self.hidden_size] ) # NPU 'tok_k' kernel only support `int32` dtype, so cast `input` from `int64` to `int32`. 
input = fluid.layers.cast(input, "int32") x_emb = self.embedding(input) - x_emb = fluid.layers.reshape( + x_emb = paddle.reshape( x_emb, shape=[-1, self.num_steps, self.hidden_size] ) if self.dropout is not None and self.dropout > 0.0: @@ -234,18 +230,16 @@ def forward(self, input, label, init_hidden, init_cell): x_emb, init_h, init_c ) - rnn_out = fluid.layers.reshape( + rnn_out = paddle.reshape( rnn_out, shape=[-1, self.num_steps, self.hidden_size] ) projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size] - ) + projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( logits=projection, label=label, soft_label=False ) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = paddle.reshape(loss, shape=[-1, self.num_steps]) loss = fluid.layers.reduce_mean(loss, dim=[0]) loss = fluid.layers.reduce_sum(loss) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 6adf1c7418013..c832093ed5909 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -727,7 +727,7 @@ def _test_slice(self): var3 = var[0:1] var4 = var[::-1] var5 = var[1, 1:, 1:] - var_reshape = fluid.layers.reshape(var, [3, -1, 3]) + var_reshape = paddle.reshape(var, [3, -1, 3]) var6 = var_reshape[:, :, -1] var7 = var[:, :, :-1] var8 = var[:1, :1, :1] @@ -820,7 +820,7 @@ def _test_slice_for_tensor_attr(self): var3 = var[0:one] var4 = var[::negative_one] var5 = var[one, one:, one:] - var_reshape = fluid.layers.reshape(var, [3, negative_one, 3]) + var_reshape = paddle.reshape(var, [3, negative_one, 3]) var6 = var_reshape[:, :, negative_one] var7 = var[:, :, :negative_one] var8 = var[:one, :one, :1] diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index bf9ee54afc767..b8e020eca945e 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -156,7 +156,7 @@ def _test_slice(self, place): var3 = var[0:1] var4 = var[::-1] var5 = var[1, 1:, 1:] - var_reshape = fluid.layers.reshape(var, [3, -1, 3]) + var_reshape = paddle.reshape(var, [3, -1, 3]) var6 = var_reshape[:, :, -1] var7 = var[:, :, :-1] var8 = var[:1, :1, :1] diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index 3c91b8c1e235c..deaebf4a45d7f 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -92,7 +92,7 @@ def body(i, ten, test_dict, test_list, test_list_dict): test_dict["test_key"] = i test_dict["test_key"] += 1 - test_list[0] = fluid.layers.reshape(test_list[0], [2, -1]) + 1 + test_list[0] = paddle.reshape(test_list[0], [2, -1]) + 1 test_list_dict[0]["test_key"] += 1 test_list_dict[0]["test_key"] = fluid.layers.relu( diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index c45e17ab77694..842d9320dafc7 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -115,7 +115,7 @@ def __split_heads(x, n_head): hidden_size = x.shape[-1] # FIXME(guosheng): 
Decouple the program desc with batch_size. - reshaped = layers.reshape( + reshaped = paddle.reshape( x=x, shape=[batch_size, -1, n_head, hidden_size // n_head] ) @@ -135,7 +135,7 @@ def __combine_heads(x): trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) # FIXME(guosheng): Decouple the program desc with batch_size. - return layers.reshape( + return paddle.reshape( x=trans_x, shape=list( map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]) @@ -281,7 +281,7 @@ def prepare_encoder( enc_input = src_word_emb + src_pos_enc # FIXME(guosheng): Decouple the program desc with batch_size. - enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) + enc_input = paddle.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) return ( layers.dropout(enc_input, dropout_prob=dropout, is_test=False) if dropout @@ -581,7 +581,7 @@ def transformer( # TODO(guosheng): Share the weight matrix between the embedding layers and # the pre-softmax linear transformation. - predict = layers.reshape( + predict = paddle.reshape( x=layers.fc( input=dec_output, size=trg_vocab_size, @@ -590,8 +590,8 @@ def transformer( num_flatten_dims=2, ), shape=[-1, trg_vocab_size], - act="softmax", ) + predict = paddle.nn.functional.softmax(predict) cost = layers.cross_entropy(input=predict, label=gold) weighted_cost = cost * weights From 7d6a4a54febcc37469bd95b2c028c3764c119751 Mon Sep 17 00:00:00 2001 From: Hulek Date: Tue, 22 Nov 2022 06:41:04 +0100 Subject: [PATCH 141/210] Delete caching from requantize_mkldnn_op and changed to Acquire API (#48113) * Delete caching from requantize_mkldnn_op and changed to Acquire API * Fixed codestyle and implementation --- .../operators/mkldnn/requantize_mkldnn_op.cc | 118 ++++++------------ 1 file changed, 37 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 4ac14d5ff95e5..c9b80ba1e7a56 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -17,7 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/requantize_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/phi/backends/onednn/onednn_helper.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" namespace paddle { namespace operators { @@ -56,101 +57,56 @@ class ReQuantOpKernel : public framework::OpKernel { platform::errors::InvalidArgument("Scale of output cannot be 0.0")); if (shift_in != 0.0f) { PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(input->dtype()), - framework::proto::VarType::UINT8, + input->dtype(), + DataType::UINT8, platform::errors::Unimplemented("Requantize does not support nonzero " "shift for signed input.")); } auto& dev_ctx = ctx.template device_context(); - const auto& engine = dev_ctx.GetEngine(); auto src_tz = phi::vectorize(input->dims()); - float reorder_scale = scale_out / scale_in; + auto src_paddle_dt = input->dtype(); + auto dst_paddle_dt = with_shift ? 
DataType::UINT8 : src_paddle_dt; + + auto xstrides = input->mem_desc().data.format_desc.blocking.strides; + std::vector vstrides(xstrides, + xstrides + input->mem_desc().data.ndims); - std::string key = platform::CreateKey( - dev_ctx, src_tz, scale_in, scale_out, ctx.OutputName("Output")); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - const std::string key_prim = key + "@r"; - const std::string key_src_mem = key + "@s"; - const std::string key_dst_mem = key + "@d"; - - std::shared_ptr src_memory; - std::shared_ptr dst_memory; - std::shared_ptr reorder_p; - reorder_p = std::static_pointer_cast(dev_ctx.GetBlob(key_prim)); - - const T* input_data = input->data(); - - if (reorder_p == nullptr) { - auto src_dt = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); - auto dst_dt = with_shift ? framework::OneDNNDataType::u8 : src_dt; - - src_memory = std::make_shared( - input->mem_desc(), engine, phi::funcs::to_void_cast(input_data)); - - auto xstrides = input->mem_desc().data.format_desc.blocking.strides; - - std::vector vstrides(xstrides, - xstrides + input->mem_desc().data.ndims); - - auto dst_md = dnnl::memory::desc({src_tz}, dst_dt, vstrides); - - dnnl::primitive_attr attri; - int mask = 0; - attri.set_output_scales(mask, {reorder_scale}); - if (with_shift) { - dnnl::post_ops post_operations; - post_operations.append_sum(); - attri.set_post_ops(post_operations); - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - uint8_t reorder_shift = - clip_to_uint8(shift_out - reorder_scale * shift_in); - std::memset(output_data, reorder_shift, output->numel()); - dst_memory = std::make_shared( - dst_md, engine, phi::funcs::to_void_cast(output_data)); - } else { - T* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory = std::make_shared( - dst_md, engine, phi::funcs::to_void_cast(output_data)); - } - - auto reorder_pd = - reorder::primitive_desc(*src_memory, *dst_memory, attri); - reorder_p = std::make_shared(reorder_pd); - - dev_ctx.SetBlob(key_prim, reorder_p); - dev_ctx.SetBlob(key_src_mem, src_memory); - dev_ctx.SetBlob(key_dst_mem, dst_memory); - } else { - src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - src_memory->set_data_handle(phi::funcs::to_void_cast(input_data)); - - dst_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_dst_mem)); - if (with_shift) { - uint8_t* output_data = output->mutable_data(ctx.GetPlace()); - uint8_t reorder_shift = - clip_to_uint8(shift_out - reorder_scale * shift_in); - std::memset(output_data, reorder_shift, output->numel()); - dst_memory->set_data_handle(output_data); - - } else { - T* output_data = output->mutable_data(ctx.GetPlace()); - dst_memory->set_data_handle(output_data); - } + dnnl::primitive_attr attrs; + int mask = 0; + float reorder_scale = scale_out / scale_in; + attrs.set_output_scales(mask, {reorder_scale}); + if (with_shift) { + uint8_t reorder_shift = + clip_to_uint8(shift_out - reorder_scale * shift_in); + attrs.set_zero_points( + DNNL_ARG_DST, mask, {static_cast(reorder_shift)}); } - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + phi::funcs::ReorderOneDNNHandler reorder_handler( + src_tz, + src_paddle_dt, + phi::funcs::ToOneDNNDataType(src_paddle_dt), + dst_paddle_dt, + phi::funcs::ToOneDNNDataType(dst_paddle_dt), + dev_ctx.GetEngine()); - reorder_p->execute(astream, *src_memory, *dst_memory); + auto src_memory_p = reorder_handler.AcquireSrcMemory( + input->mem_desc(), phi::funcs::to_void_cast(input->data())); + auto 
dst_memory_p = reorder_handler.AcquireDstMemory( + output, src_tz, vstrides, dev_ctx.GetPlace()); + + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory_p, src_memory_p, attrs); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *src_memory_p, *dst_memory_p); astream.wait(); - output->set_mem_desc(dst_memory->get_desc()); + output->set_mem_desc(dst_memory_p->get_desc()); } }; From 47875ba7de461231d4e67f54ef22359e17b8828a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Tue, 22 Nov 2022 14:04:15 +0800 Subject: [PATCH 142/210] remove hard_sigmoid in nn.py under fluid (#47890) * remove hard_sigmoid in nn.py under fluid * fix hardsigmoid test case * fix hardsigmoid test case --- python/paddle/fluid/layers/nn.py | 45 ------------------- .../ir/inference/test_trt_activation_pass.py | 2 +- .../unittests/mlu/test_hard_sigmoid_op_mlu.py | 4 +- .../unittests/npu/test_hard_sigmoid_op_npu.py | 4 +- .../tests/unittests/test_activation_op.py | 4 +- .../tests/unittests/test_imperative_basic.py | 2 +- 6 files changed, 8 insertions(+), 53 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 45dac4372a23d..3f66af243d97d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -124,7 +124,6 @@ 'log', 'crop_tensor', 'pow', - 'hard_sigmoid', 'prelu', 'brelu', 'leaky_relu', @@ -9056,50 +9055,6 @@ def pow(x, factor=1.0, name=None): return out -@templatedoc() -def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): - """ - ${comment} - Parameters: - x (${x_type}): ${x_comment} - slope (float, optional): ${slope_comment} - offset (float, optional): ${offset_comment} - name (str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` - - Returns: - ${out_type}: ${out_comment} - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - - data = fluid.layers.fill_constant(shape=[3, 2], value=0.5, dtype='float32') # [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] - result = fluid.layers.hard_sigmoid(data) # [[0.6, 0.6], [0.6, 0.6], [0.6, 0.6]] - """ - if _non_static_mode(): - return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) - - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid' - ) - - helper = LayerHelper('hard_sigmoid', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='hard_sigmoid', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'slope': slope, 'offset': offset}, - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 4c86911c2eae1..7f4276bff5e7d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -92,7 +92,7 @@ def append_act(self, x): class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.hard_sigmoid(x) + return paddle.nn.functional.hardsigmoid(x) class TensorRTSubgraphPassHardSwishPluginTest( diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py index 695d37ec54d6a..6575b0decd4af 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py @@ -162,7 +162,7 @@ def test_fluid_api(self): paddle.enable_static() with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) @@ -170,7 +170,7 @@ def test_fluid_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out = paddle.fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x) np.testing.assert_allclose(out_ref, out.numpy()) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index 44155e4388062..55dc1e0a1102b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -123,7 +123,7 @@ def test_dygraph_api(self): def test_fluid_api(self): with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) @@ -131,7 +131,7 @@ def test_fluid_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out = paddle.fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x) np.testing.assert_allclose(out_ref, out.numpy()) paddle.enable_static() diff 
--git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 3b0057e226a3c..4411fdc3d1006 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -3338,7 +3338,7 @@ def test_dygraph_api(self): def test_fluid_api(self): with fluid.program_guard(fluid.Program()): x = fluid.data('X', self.x_np.shape, self.x_np.dtype) - out = fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x, slope=0.2) exe = fluid.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) @@ -3346,7 +3346,7 @@ def test_fluid_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out = paddle.fluid.layers.hard_sigmoid(x) + out = paddle.nn.functional.hardsigmoid(x, slope=0.2) np.testing.assert_allclose(out_ref, out.numpy(), rtol=1e-05) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 197d68db745c7..076c38773d583 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -896,7 +896,7 @@ def func_append_activation_in_dygraph1(self): with fluid.dygraph.guard(): a = paddle.to_tensor(a_np) res1 = func(a, act="hard_sigmoid") - res2 = fluid.layers.hard_sigmoid(a) + res2 = paddle.nn.functional.hardsigmoid(a, slope=0.2) np.testing.assert_array_equal(res1.numpy(), res2.numpy()) def test_append_activation_in_dygraph1(self): From 7bbdbe5be7959034ab7d1d518b14ac629c02a513 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Tue, 22 Nov 2022 07:29:21 +0100 Subject: [PATCH 143/210] Some residualdata fixes (#48118) Removed ResidualData and Bias from ExtraAttrProperties because it's not an attribute. 
Removed bug with checking for ResidualData attribute in matmul_elementwise_add_fuse_pass Removed residualData from list of matmul outputs in cpu_bfloat16_pass.cc because it's input Co-authored-by: Tomasz Socha --- paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc | 5 +---- .../ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc | 4 ---- paddle/fluid/operators/ops_extra_info.h | 2 -- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index ba8bacd200b12..12a673b89d681 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -199,10 +199,7 @@ class DeQuantizer final : public Quanter { bool IsNotPermittedName(const std::string& output_name) const override { std::unordered_map> block_list{ {"layer_norm", - {"Mean", "Variance"}}, // not used in inference in MKLDNN - {"matmul", {"ResidualData"}}, // artifical output, already dequantized - {"matmul_v2", - {"ResidualData"}}}; // artifical output, already dequantized + {"Mean", "Variance"}}}; // not used in inference in oneDNN std::vector blocked_outputs{"XShape"}; // blocklist for any op auto op_name = op->Name(); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc index 795ecce2ee815..85e49c68ff824 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_elementwise_add_mkldnn_fuse_pass.cc @@ -64,10 +64,6 @@ void MatmulElementwiseAddMKLDNNFusePass::FuseMatmulElementwiseAdd( << "op compat for matmul_elementwise_add_mkldnn_fuse_pass failed."; return; } - if (matmul->Op()->HasAttr("ResidualData")) { - LOG(WARNING) << "matmul_elementwise_add can be fused once"; - return; - } matmul->Op()->SetInput("ResidualData", {elementwise_addend->Name()}); matmul->Op()->SetOutput("Out", {elementwise_add_out->Name()}); diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 6aa6bd21fba1f..33f8c8ddb9c8e 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -88,7 +88,6 @@ const std::unordered_map {"use_cudnn", ExtraAttrProperty::SCHEDULE}, {"use_mkldnn", ExtraAttrProperty::SCHEDULE}, // ONEDNN dedicated attributes - {"Bias", ExtraAttrProperty::ONEDNN}, {"data_format", ExtraAttrProperty::ONEDNN}, {"force_fp32_output", ExtraAttrProperty::ONEDNN}, {"fuse_activation", ExtraAttrProperty::ONEDNN}, @@ -108,7 +107,6 @@ const std::unordered_map {"fused_transpose_X", ExtraAttrProperty::ONEDNN}, {"fused_transpose_Y", ExtraAttrProperty::ONEDNN}, {"mkldnn_data_type", ExtraAttrProperty::ONEDNN}, - {"ResidualData", ExtraAttrProperty::ONEDNN}, {"scale_x", ExtraAttrProperty::ONEDNN}, {"scale_y", ExtraAttrProperty::ONEDNN}, {"scale_out", ExtraAttrProperty::ONEDNN}, From d389ddb5287c090ee4286a687dab9848c24a2d93 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Tue, 22 Nov 2022 14:41:26 +0800 Subject: [PATCH 144/210] [CodeStyle][py2][U008] remove unnecessary args in `super()` for some example code (#47643) * [CodeStyle][py2][U008] remove unnecessary args in `super()` for some example code * empty commit, test=document_fix --- python/paddle/distributed/spawn.py | 2 +- python/paddle/fluid/dygraph/jit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index a371f5d559b59..4820d6dcd2f03 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -531,7 +531,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): class LinearNet(nn.Layer): def __init__(self): - super(LinearNet, self).__init__() + super().__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index c359c6d152f9e..042ddb25fc65e 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -1573,7 +1573,7 @@ def trace(layer, inputs): class ExampleLayer(paddle.nn.Layer): def __init__(self): - super(ExampleLayer, self).__init__() + super().__init__() self._fc = paddle.nn.Linear(3, 10) def forward(self, input): @@ -1591,7 +1591,7 @@ def forward(self, input): print(out_static_graph[0].shape) # (2, 10) # save the static graph model for inference - static_layer.save_inference_model(dirname='./saved_infer_model') + static_layer.save_inference_model('./saved_infer_model') """ assert isinstance( @@ -1623,7 +1623,7 @@ def set_strategy(self, build_strategy=None, exec_strategy=None): class ExampleLayer(paddle.nn.Layer): def __init__(self): - super(ExampleLayer, self).__init__() + super().__init__() self._fc = paddle.nn.Linear(3, 10) def forward(self, input): @@ -1728,7 +1728,7 @@ def save_inference_model(self, path, feed=None, fetch=None, **kwargs): class ExampleLayer(paddle.nn.Layer): def __init__(self): - super(ExampleLayer, self).__init__() + super().__init__() self._fc = paddle.nn.Linear(3, 10) def forward(self, input): From ae2564549c5b8bbc9de53ca464524f1519a66aad Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Tue, 22 Nov 2022 14:55:31 +0800 Subject: [PATCH 145/210] remove isfinite and has_nan (#48046) --- python/paddle/fluid/layers/tensor.py | 64 ------------------- .../fluid/tests/unittests/test_isfinite_op.py | 22 ------- 2 files changed, 86 deletions(-) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index d032b8cd20a7c..ee7b764ad7b86 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -64,8 +64,6 @@ 'zeros', 'reverse', 'has_inf', - 'has_nan', - 'isfinite', 'linspace', 'zeros_like', 'ones_like', @@ -1572,68 +1570,6 @@ def has_inf(x): return out -def has_nan(x): - """ - Test if any of x contains a NAN - - Args: - x (Tensor): The Tensor to be checked. - - Returns: - Tensor: The tensor variable storing the output, only a bool value, indicating that whether there is NAN in x or not. - - Examples: - .. code-block:: python - - import paddle - data = paddle.randn(shape=[2,3], dtype="float32") - res = paddle.fluid.layers.has_nan(data) - # [False] - - """ - if _non_static_mode(): - return _legacy_C_ops.isnan(x) - - check_type(x, 'x', (Variable), 'has_nan') - helper = LayerHelper("isnan", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out}) - return out - - -def isfinite(x): - """ - - Test if any of x contains an infinity/NAN number. If all the elements are finite, - returns true, else false. - - Args: - x(Tensor): The Tensor to be checked. - - Returns: - Tensor: The tensor storing the output, contains a bool value. - - Examples: - - .. 
code-block:: python - - import paddle - - x = paddle.rand(shape=[4, 6], dtype='float32') - y = paddle.fluid.layers.isfinite(x) - print(y) - - """ - check_variable_and_dtype( - x, "x", ["float32", "float64", "int32", "int64"], "isfinite" - ) - helper = LayerHelper("isfinite", **locals()) - - out = helper.create_variable_for_type_inference(dtype='bool') - helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out}) - return out - - def linspace(start, stop, num, dtype=None, name=None): r""" This OP return fixed number of evenly spaced values within a given interval. diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py index e2fa9f67b5331..cbe12d1cb3f4b 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py @@ -40,20 +40,6 @@ def test_output(self): self.check_output() -class TestRaiseError(unittest.TestCase): - def test_errors(self): - def test_type(): - fluid.layers.isfinite([10]) - - self.assertRaises(TypeError, test_type) - - def test_dtype(): - data = fluid.data(shape=[10], dtype="float16", name="input") - fluid.layers.isfinite(data) - - self.assertRaises(TypeError, test_dtype) - - @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) @@ -129,19 +115,11 @@ def test_has_inf_bad_x(): self.assertRaises(TypeError, test_has_inf_bad_x) - def test_has_nan_bad_x(): - data = [1, 2, 3] - result = fluid.layers.has_nan(data) - - self.assertRaises(TypeError, test_has_nan_bad_x) - with fluid.dygraph.guard(): data = paddle.zeros([2, 3]) result = paddle.fluid.layers.has_inf(data) expect_value = np.array([False]) self.assertEqual((result.numpy() == expect_value).all(), True) - result = paddle.fluid.layers.has_nan(data) - self.assertEqual((result.numpy() == expect_value).all(), True) if __name__ == '__main__': From 161998f715039c442732d26e232b2a9e23ecd630 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Tue, 22 Nov 2022 15:12:11 +0800 Subject: [PATCH 146/210] [Auto Parallel] Recompute Support New Graph Executor (#47846) * add depend * fp16 pass distinguish None & False * engine log --- .../paddle/distributed/auto_parallel/utils.py | 91 +++++++++++++++++++ .../passes/auto_parallel_recompute.py | 12 ++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index be4c68d97d840..c31642a9e2af3 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1407,6 +1407,27 @@ def naive_set_dist_op_attr_for_program_by_mesh_and_mapping( ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) +def naive_set_dist_op_attr_for_program_by_mesh( + new_op, process_mesh, ctx, is_recompute=False +): + assert process_mesh is not None + + new_op_dist_attr = OperatorDistributedAttribute() + + for input_varname in new_op.desc.input_arg_names(): + var = ctx.serial_main_program.global_block().var(input_varname) + mapping = ctx.get_tensor_dist_attr_for_program(var).dims_mapping + new_op_dist_attr.set_input_dims_mapping(input_varname, mapping) + for output_varname in new_op.desc.output_arg_names(): + var = ctx.serial_main_program.global_block().var(output_varname) + mapping = ctx.get_tensor_dist_attr_for_program(var).dims_mapping + new_op_dist_attr.set_output_dims_mapping(output_varname, mapping) + + new_op_dist_attr.process_mesh = process_mesh + new_op_dist_attr.is_recompute 
= is_recompute + ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) + + def update_op_dims_mapping_by_default_dist_impl(dist_op): changed = False op_dist_attr = dist_op.dist_attr @@ -2102,3 +2123,73 @@ def _copy_dist_attr_from_cpp_for_graph(dist_context): py_dist_attr = dist_context.get_op_dist_attr_for_graph(node) cpp_dist_attr = node.op().dist_attr _copy_op_dist_attr_from_cpp(cpp_dist_attr, py_dist_attr) + + +def insert_dependencies_for_two_ops( + block, + idx, + prior_op, + posterior, + dist_context, + is_recompute=False, + sync=False, +): + """ + dependency: prior_op should be run before posterior + """ + + assert ( + len(prior_op.output_arg_names) >= 1 + ), "first op of dependency should at least have one output. [{}]".format( + str(prior_op) + ) + assert ( + len(posterior.input_arg_names) >= 1 + ), "second op of dependency should at least have one input. [{}]".format( + str(posterior) + ) + prior_op_mesh = dist_context.get_op_dist_attr_for_program( + prior_op + ).process_mesh + posterior_mesh = dist_context.get_op_dist_attr_for_program( + posterior + ).process_mesh + assert ( + prior_op_mesh == posterior_mesh + ), "two ops of dependency should have same mesh but got [{}] and [{}]".format( + str(prior_op_mesh), str(posterior_mesh) + ) + + def _select_best_depend_var(vars): + + vars_with_numels = [(var, get_var_numel(var)) for var in vars] + vars_with_numels.sort(key=lambda x: x[1]) + + return vars_with_numels[-1][0] + + first_var = _select_best_depend_var( + [block.var(name) for name in prior_op.output_arg_names] + ) + second_var = _select_best_depend_var( + [block.var(name) for name in posterior.input_arg_names] + ) + + depend_op = block._insert_op_without_sync( + idx, + type='nop', + inputs={ + "X": first_var, + }, + outputs={"Out": second_var}, + ) + # depend_op.desc.set_type("depend") + depend_op._set_attr(OP_ROLE_KEY, OpRole.Backward) + # depend_op.desc.set_input("Dep", [first_var.name]) + # self.desc.set_output(out_proto.name, out_arg_names) + + naive_set_dist_op_attr_for_program_by_mesh( + depend_op, prior_op_mesh, dist_context, is_recompute + ) + + if sync: + block._sync_with_cpp() diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 5bdbe9d2dd5d9..44e02fb3ffad8 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -29,6 +29,7 @@ ) from paddle.distributed.auto_parallel.utils import ( naive_set_dist_op_attr_for_program_by_mesh_and_mapping, + insert_dependencies_for_two_ops, ) @@ -449,6 +450,7 @@ def _apply_single_impl(self, main_program, startup_program, context): while idx - 1 >= 0 and ops[idx - 1].type == "sum": idx -= 1 segment_descs = ckpt_ops_dict[fwd_op_id][1] + rc_op = None for _, op_desc in reversed(list(enumerate(segment_descs))): rc_op = main_block._insert_op_without_sync( idx, type='nop' @@ -466,7 +468,15 @@ def _apply_single_impl(self, main_program, startup_program, context): ) ckpt_ops_dict[fwd_op_id][0] = False - + if rc_op: + insert_dependencies_for_two_ops( + main_block, + idx, + main_block.ops[rc_op.idx - 1], + rc_op, + self._dist_context, + sync=False, + ) main_program._sync_with_cpp() def reset_op_dist_attr(self, op, var_name_dict): From 2ab60c305cebe3fef4ca6cb4e4045822db121935 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Tue, 22 Nov 2022 15:25:19 +0800 Subject: [PATCH 147/210] delete rank api (#48217) --- 
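Note: the `rank` helper removed below is a thin wrapper whose own docstring example already calls the public `paddle.rank` API, so that reads as the intended replacement. A small sketch of the surviving usage, with an arbitrary example tensor:

    import paddle

    input = paddle.rand((3, 100, 100))
    rank = paddle.rank(input)  # 0-D int32 Tensor holding the number of dimensions
    print(rank)  # 3
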
python/paddle/fluid/layers/nn.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3f66af243d97d..96ca8a459bd50 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -145,7 +145,6 @@ 'slice', 'strided_slice', 'shape', - 'rank', 'size', 'logical_and', 'logical_or', @@ -10467,34 +10466,6 @@ def shape(input): return out -def rank(input): - """ - - The OP returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. - - Args: - input (Tensor): The input N-D tensor with shape of :math:`[N_1, N_2, ..., N_k]`, the data type is arbitrary. - - Returns: - Tensor, the output data type is int32.: The 0-D tensor with the dimensions of the input Tensor. - - Examples: - .. code-block:: python - - import paddle - - input = paddle.rand((3, 100, 100)) - rank = paddle.rank(input) - print(rank) - # 3 - """ - check_type(input, 'input', (Variable), 'input') - ndims = len(input.shape) - out = assign(np.array(ndims, 'int32')) - - return out - - @deprecated(since="2.0.0", update_to="paddle.numel") def size(input): """ From df4dfda0db7ae2541c71cc796ab35113cdb72b1c Mon Sep 17 00:00:00 2001 From: Tian Zheng Date: Tue, 22 Nov 2022 15:28:36 +0800 Subject: [PATCH 148/210] CudnnNormConvolution is no longer supported on NVIDIA Hopper GPUs (#48203) * Skip tests that use fused_ops on H100 * Add error message to FusedOps on H100 --- paddle/fluid/operators/fused/cudnn_norm_conv.cu.h | 8 ++++++++ paddle/fluid/operators/fused/cudnn_norm_conv_test.cc | 8 ++++---- .../fluid/tests/unittests/ir/test_fuse_resnet_unit.py | 5 +++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 01e5e24e0a016..a5e210dc7fe3c 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -45,6 +45,14 @@ struct NormConvolutionArgs { int stride, int dilation, int group) { + PADDLE_ENFORCE_LT( + ctx.GetComputeCapability(), + 90, + phi::errors::PreconditionNotMet( + "Expect compute compatiblity to be less than 90, but got %d. 
" + "CUDNN FusedOps is no longer available on H100 and later " + "devices.", + ctx.GetComputeCapability())); PADDLE_ENFORCE_EQ( input_shape.size(), 4U, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 81e298ff9c265..3369a8ca4a9c5 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -439,7 +439,7 @@ TEST(CudnnNormConvFp16, K1S1) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -469,7 +469,7 @@ TEST(CudnnNormConvFp16, K3S1) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -499,7 +499,7 @@ TEST(CudnnNormConvFp16, K1S1O4) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -529,7 +529,7 @@ TEST(CudnnNormConvFp16, K1S2O4) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet); diff --git a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py index 4dabcdbcf35d6..28d49fbac7bbe 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py +++ b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py @@ -25,9 +25,10 @@ @unittest.skipIf( not paddle.is_compiled_with_cuda() or paddle.get_cudnn_version() < 8000 - or paddle.device.cuda.get_device_capability()[0] < 7, + or paddle.device.cuda.get_device_capability()[0] < 7 + or paddle.device.cuda.get_device_capability()[0] >= 9, "only support with cuda and cudnn version is at least 8.0 " - "and device's compute capability is at least 7.0", + "and device's compute capability is at least 7.0 and less than 9.0", ) class TestFuseResNetUnit(unittest.TestCase): def test_fuse_resenet_unit(self): From 48d5c36b0bd53f92bff16186c9435848e07d2391 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Tue, 22 Nov 2022 15:59:12 +0800 Subject: [PATCH 149/210] add group operators (#48208) --- .../auto_parallel/tuner/rule_based_tuner.py | 262 ++++++++++++++++++ .../unittests/auto_parallel/CMakeLists.txt | 1 + .../auto_parallel/test_group_operators.py | 133 +++++++++ 3 files changed, 396 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py create mode 100644 
python/paddle/fluid/tests/unittests/auto_parallel/test_group_operators.py diff --git a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py new file mode 100644 index 0000000000000..83b4586af7c65 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py @@ -0,0 +1,262 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class OperatorGroupUtil: + common_starts = ["layer_norm", "matmul_v2", "matmul"] + + @staticmethod + def get_ranks(seq): + """Get rank array of the given seq by doubled algorithm.""" + ordered_seq = sorted(list(set(seq))) + item_to_rank = {item: idx for idx, item in enumerate(ordered_seq)} + inter_ranks = [item_to_rank[item] for item in seq] + + length = len(inter_ranks) + power = 0 + interval = 2**power + while interval < length: + for idx, item in enumerate(inter_ranks): + if idx + interval >= length: + inter_ranks[idx] = [item, -1] + else: + inter_ranks[idx] = [item, inter_ranks[idx + interval]] + + tmp = [] + for item in inter_ranks: + if item not in tmp: + tmp.append(item) + tmp.sort(key=lambda x: (x[0], x[1])) + item_to_rank = {} + for idx, val in enumerate(tmp): + key = ",".join(str(item) for item in val) + item_to_rank[key] = idx + + inter_ranks = [ + item_to_rank[",".join(str(val) for val in item)] + for item in inter_ranks + ] + power += 1 + interval = 2**power + + return inter_ranks + + @staticmethod + def get_suffixes(ranks): + """Get suffix array by the given rank array.""" + suffixes = [0 for idx in range(len(ranks))] + for idx, item in enumerate(ranks): + suffixes[item] = idx + return suffixes + + @staticmethod + def get_heights(suffixes, seq): + """Get height array by the suffix array and seq""" + heights = [-1 for i in range(len(suffixes))] + for i in range(1, len(seq)): + x = seq[suffixes[i - 1] :] + y = seq[suffixes[i] :] + max_len = len(x) if len(x) > len(y) else len(y) + same_count = 0 + for j in range(max_len): + if j >= len(x) or j >= len(y): + break + else: + if x[j] == y[j]: + same_count += 1 + else: + break + heights[i] = same_count + + return heights + + @staticmethod + def get_longest_repeated_sub_seq(suffixes, heights, seq): + """Get longest repeated sub sequence by suffix array algorithm.""" + length = len(seq) + if length <= 1: + return None + k = length // 2 + height_groups = [] + longest_sub_seq = None + longest_sub_seqs = [] + + while k >= 2: + height_group = [] + for i in range(1, len(heights)): + if heights[i] >= k: + if i == 1: + height_group.append(0) + height_group.append(i) + else: + if i == 1: + height_groups.append([0]) + height_group = [i] + else: + height_groups.append(height_group) + height_group = [i] + + if height_group: + height_groups.append(height_group) + + for height_group in height_groups: + suffix_group = [] + index_group = [] + for idx in height_group: + suffix_group.append(idx) + index_group.append(suffixes[idx]) + + max_index = max(index_group) + 
min_index = min(index_group) + if max_index - min_index >= k: + longest_sub_seq = seq[min_index : min_index + k] + if longest_sub_seq[0] in OperatorGroupUtil.common_starts: + return longest_sub_seq + if longest_sub_seq is not None: + return longest_sub_seq + + k -= 1 + height_groups = [] + + return longest_sub_seq + + @staticmethod + def get_decomposed_sub_seq(seq): + """Get decomposed sub seq s by seq S such as s * R = S.""" + if not seq: + return seq + + decomposed_sub_seq = seq + seq_len = len(seq) + if seq_len == 1: + return decomposed_sub_seq + else: + for interval in range(2, seq_len + 1): + if seq_len % interval == 0: + repeated_times = seq_len // interval + decomposed_sub_seq = seq[0:interval] + decomposed = True + for j in range(1, repeated_times + 1): + sub_seq = seq[interval * (j - 1) : interval * j] + if sub_seq != decomposed_sub_seq: + decomposed = False + break + if decomposed: + return decomposed_sub_seq + + return decomposed_sub_seq + + @staticmethod + def replace_by_decomposed_seq(sub_seq, seq): + """Replace seq by sub seq.""" + if not sub_seq: + return seq + + result = [] + sub_seq_len = len(sub_seq) + i = 0 + while i < len(seq): + if seq[i : i + sub_seq_len] == sub_seq: + result.append(seq[i : i + sub_seq_len]) + i += sub_seq_len + else: + result.append(seq[i]) + i += 1 + + return result + + @staticmethod + def stop_replace(seq): + for item in seq: + if not isinstance(item, list): + return False + return True + + +class RuleBasedTuner: + def __init__(self, dist_context, mode="train"): + self._dist_context = dist_context + self._mode = mode + + def group_operators(self, ops): + """ + Group operators to layers. + + Args: + ops (list): A operator list. + + Returns: + List: The list contains the list of operators which belong to the same layer. 
+ """ + seq = [op.type for op in ops] + + while not OperatorGroupUtil.stop_replace(seq): + to_replace_seq = [] + to_replace_idxes = [] + has_append = False + for idx, item in enumerate(seq): + if not isinstance(item, list): + has_append = True + to_replace_seq.append(item) + to_replace_idxes.append(idx) + elif isinstance(seq, list) and not has_append: + continue + elif isinstance(seq, list) and has_append: + break + + ranks = OperatorGroupUtil.get_ranks(to_replace_seq) + suffixes = OperatorGroupUtil.get_suffixes(ranks) + heights = OperatorGroupUtil.get_heights(suffixes, to_replace_seq) + longest_sub_seq = OperatorGroupUtil.get_longest_repeated_sub_seq( + suffixes, heights, to_replace_seq + ) + has_merged = False + if longest_sub_seq is None: + for i in range(to_replace_idxes[-1] + 1, len(seq)): + if isinstance(seq[i], list): + seq[i] = to_replace_seq + seq[i] + has_merged = True + break + if not has_merged: + for i in range(to_replace_idxes[0] - 1, -1, -1): + if isinstance(seq[i], list): + seq[i].extend(to_replace_seq) + has_merged = True + break + if not has_merged: + seq = [to_replace_seq] + break + + decomposed_sub_seq = OperatorGroupUtil.get_decomposed_sub_seq( + longest_sub_seq + ) + to_replace_seq = OperatorGroupUtil.replace_by_decomposed_seq( + decomposed_sub_seq, to_replace_seq + ) + result = seq[: to_replace_idxes[0]] + if not has_merged: + result.extend(to_replace_seq) + result.extend(seq[to_replace_idxes[-1] + 1 :]) + seq = result + + layers = [] + idx = 0 + for groups in seq: + layer = [] + for op in groups: + layer.append(ops[idx]) + idx += 1 + layers.append(layer) + + return layers diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 201241cb31e63..5ba84df8395d3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -118,5 +118,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU) test_conditional_block_reshard) py_test_modules(test_engine_api_error MODULES test_engine_api_error) py_test_modules(test_fp16_assign MODULES test_fp16_assign) + py_test_modules(test_group_operators MODULES test_group_operators) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_group_operators.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_group_operators.py new file mode 100644 index 0000000000000..f7cb28d559cbe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_group_operators.py @@ -0,0 +1,133 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest +import numpy as np + +import paddle +import paddle.static as static + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import ( + GPTModel, + GPTForPretraining, + GPTPretrainingCriterion, +) + + +def get_gpt_model( + train_program, start_program, place, batch_size, sequence_len, vocab_size +): + with static.program_guard(train_program, start_program): + tokens = paddle.static.data( + name="tokens", shape=[batch_size, sequence_len], dtype='int64' + ) + position_ids = paddle.static.data( + name="position_ids", shape=[batch_size, sequence_len], dtype='int64' + ) + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32', + ) + labels = paddle.static.data( + name="labels", shape=[batch_size, sequence_len], dtype='int64' + ) + loss_mask = paddle.static.data( + name="loss_mask", shape=[batch_size, sequence_len], dtype='float32' + ) + + gpt = GPTModel( + vocab_size=1000, + hidden_size=64, + num_hidden_layers=2, + num_attention_heads=8, + intermediate_size=256, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + ) + + model = GPTForPretraining( + gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02 + ) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + def gen_data(): + np.random.seed(2021) + tokens = [] + position_ids = [] + attention_mask = [] + labels = [] + loss_mask = [] + for _ in range(batch_size): + tokens.append(np.random.randint(vocab_size, size=sequence_len)) + position_ids.append(np.arange(sequence_len)) + attention_mask.append([np.tril(np.ones(sequence_len))]) + labels.append(np.random.randint(vocab_size, size=sequence_len)) + loss_mask.append(np.ones(sequence_len)) + + return tokens, position_ids, attention_mask, labels, loss_mask + + return train_program, start_program, loss, gen_data + + +class TestGroupOperators(unittest.TestCase): + def test_gpt(self): + modeling.init_global() + train_program = static.Program() + start_program = static.Program() + place = paddle.set_device("gpu") + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + train_program, start_program, loss, gen_data = get_gpt_model( + train_program, + start_program, + place, + batch_size, + sequence_len, + vocab_size, + ) + from paddle.distributed.auto_parallel.tuner.rule_based_tuner import ( + RuleBasedTuner, + ) + from paddle.distributed.auto_parallel.dist_context import ( + DistributedContext, + ) + + dist_context = DistributedContext() + tuner = RuleBasedTuner(dist_context) + layers = tuner.group_operators(train_program.global_block().ops) + op_types = [] + for layer in layers: + tmp = [] + for op in layer: + tmp.append(op.type) + op_types.append(tmp) + + +if __name__ == "__main__": + unittest.main() From aa36c6aa27c7c4a5a193ec6012f9bf30ff9e177c Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Tue, 22 Nov 2022 16:27:18 +0800 Subject: [PATCH 150/210] [PHI decoupling] move vol2col from fluid to phi (#48175) * move vol2col from fluid to phi * update copyright year --- paddle/fluid/operators/conv_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/vol2col_test.cc | 13 ++++--- 
paddle/phi/kernels/funcs/CMakeLists.txt | 1 + .../math => phi/kernels/funcs}/vol2col.cc | 34 ++++++++--------- .../math => phi/kernels/funcs}/vol2col.cu | 38 +++++++++---------- .../math => phi/kernels/funcs}/vol2col.h | 17 ++++----- .../phi/kernels/impl/conv_grad_kernel_impl.h | 12 +++--- paddle/phi/kernels/impl/conv_kernel_impl.h | 4 +- .../impl/conv_transpose_grad_kernel_impl.h | 4 +- .../kernels/impl/conv_transpose_kernel_impl.h | 4 +- 11 files changed, 62 insertions(+), 68 deletions(-) rename paddle/{fluid/operators/math => phi/kernels/funcs}/vol2col.cc (93%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/vol2col.cu (95%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/vol2col.h (90%) diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 7752fb6c4e3d1..79d07887fb0e0 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -23,8 +23,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/vol2col.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 9bc7473d967cd..1f5dd8a9b2284 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -44,7 +44,6 @@ endif() math_library(matrix_bit_code) math_library(unpooling) -math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index 65db94752b987..7c44a97513583 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/funcs/vol2col.h" #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -84,7 +85,7 @@ void testVol2col() { output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -110,7 +111,7 @@ void testVol2col() { paddle::framework::TensorCopySync(input_tmp, *place, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + phi::funcs::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -201,7 +202,7 @@ void testVol2col() { output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -227,7 +228,7 @@ void testVol2col() { paddle::framework::TensorCopySync(input_tmp, *place, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + phi::funcs::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index ac1bd1fd45c72..41c6cf677717d 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -17,6 +17,7 @@ math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) math_library(cross_entropy) +math_library(vol2col) cc_library( phi_data_layout_transform diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc similarity index 93% rename from paddle/fluid/operators/math/vol2col.cc rename to paddle/phi/kernels/funcs/vol2col.cc index 041d79ee1f175..b2b58ee4eb79c 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/phi/kernels/funcs/vol2col.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/kernels/funcs/vol2col.h" #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { /* * vol = [input_channels, input_depth, input_height, input_width] @@ -38,13 +37,13 @@ class Vol2ColFunctor { const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of vol should be 4, but received %d.", vol.dims().size())); PADDLE_ENFORCE_EQ(col->dims().size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of col should be 7, but received %d.", col->dims().size())); @@ -81,7 +80,7 @@ class Vol2ColFunctor { PADDLE_ENFORCE_EQ( input_depth_tmp, output_depth, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_depth(%d) and output_depth(%d) are mismatching.", input_depth_tmp, output_depth)); @@ -92,7 +91,7 @@ class Vol2ColFunctor { PADDLE_ENFORCE_EQ( input_height_tmp, output_height, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_height(%d) and output_height(%d) are mismatching.", input_height_tmp, output_height)); @@ -103,7 +102,7 @@ class Vol2ColFunctor { PADDLE_ENFORCE_EQ( input_width_tmp, output_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_width(%d) and output_width(%d) are mismatching.", input_width_tmp, output_width)); @@ -164,13 +163,13 @@ class Col2VolFunctor { const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of vol should be 4, but received %d.", vol->dims().size())); PADDLE_ENFORCE_EQ(col.dims().size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of col should be 7, but received %d.", col.dims().size())); @@ -206,7 +205,7 @@ class Col2VolFunctor { PADDLE_ENFORCE_EQ( input_depth_tmp, output_depth, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_depth(%d) and output_depth(%d) are mismatching.", input_depth_tmp, output_depth)); @@ -217,7 +216,7 @@ class Col2VolFunctor { PADDLE_ENFORCE_EQ( input_height_tmp, output_height, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_height(%d) and output_height(%d) are mismatching.", input_height_tmp, output_height)); @@ -228,7 +227,7 @@ class Col2VolFunctor { PADDLE_ENFORCE_EQ( input_width_tmp, output_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_width(%d) and output_width(%d) are mismatching.", input_width_tmp, output_width)); @@ -278,6 +277,5 @@ template class Vol2ColFunctor; template class Col2VolFunctor; template class Col2VolFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/phi/kernels/funcs/vol2col.cu similarity index 95% rename from paddle/fluid/operators/math/vol2col.cu rename to paddle/phi/kernels/funcs/vol2col.cu index 999e29470ebbd..9d6fe1c4d9f3a 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/phi/kernels/funcs/vol2col.cu @@ -15,14 +15,13 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/vol2col.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template __global__ void vol2col(int num_kernels, @@ -112,12 +111,12 @@ void Vol2ColFunctor::operator()( const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of vol should be 4, but received %d.", vol.dims().size())); PADDLE_ENFORCE_EQ(col->dims().size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of col should be 7, but received %d.", col->dims().size())); @@ -149,7 +148,7 @@ void Vol2ColFunctor::operator()( 1; PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_depth(%d) and output_depth(%d) are mismatching.", input_depth_tmp, output_depth)); @@ -160,7 +159,7 @@ void Vol2ColFunctor::operator()( PADDLE_ENFORCE_EQ( input_height_tmp, output_height, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_height(%d) and output_height(%d) are mismatching.", input_height_tmp, output_height)); @@ -170,7 +169,7 @@ void Vol2ColFunctor::operator()( 1; PADDLE_ENFORCE_EQ(input_width_tmp, output_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_width(%d) and output_width(%d) are mismatching.", input_width_tmp, output_width)); @@ -180,7 +179,7 @@ void Vol2ColFunctor::operator()( int max_threads = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &max_threads); + phi::backends::gpu::ChangeThreadNum(context, &max_threads); #endif const int threads = max_threads; @@ -318,12 +317,12 @@ void Col2VolFunctor::operator()( const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of vol should be 4, but received %d.", vol->dims().size())); PADDLE_ENFORCE_EQ(col.dims().size(), 7, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of col should be 7, but received %d.", col.dims().size())); @@ -356,7 +355,7 @@ void Col2VolFunctor::operator()( 1; PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_depth(%d) and output_depth(%d) are mismatching.", input_depth_tmp, output_depth)); @@ -367,7 +366,7 @@ void Col2VolFunctor::operator()( PADDLE_ENFORCE_EQ( input_height_tmp, output_height, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_height(%d) and output_height(%d) are mismatching.", input_height_tmp, output_height)); @@ -377,7 +376,7 @@ void Col2VolFunctor::operator()( 1; PADDLE_ENFORCE_EQ(input_width_tmp, output_width, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "input_width(%d) and output_width(%d) are mismatching.", input_width_tmp, output_width)); @@ -386,7 +385,7 @@ void Col2VolFunctor::operator()( int max_threads = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &max_threads); + phi::backends::gpu::ChangeThreadNum(context, &max_threads); #endif const int threads = max_threads; @@ -423,6 +422,5 @@ template class 
Vol2ColFunctor; template class Col2VolFunctor; template class Col2VolFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/phi/kernels/funcs/vol2col.h similarity index 90% rename from paddle/fluid/operators/math/vol2col.h rename to paddle/phi/kernels/funcs/vol2col.h index d0a901ac1fc58..283ab3ea06563 100644 --- a/paddle/fluid/operators/math/vol2col.h +++ b/paddle/phi/kernels/funcs/vol2col.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,13 +16,11 @@ limitations under the License. */ #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/errors.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { using DataLayout = phi::DataLayout; @@ -92,6 +90,5 @@ class Col2VolFunctor { const DataLayout data_layout = DataLayout::kNCHW) const; }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index 0d4cdddf6b520..e66a870c3aa25 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -15,11 +15,11 @@ #pragma once #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/vol2col.h" namespace phi { @@ -147,7 +147,7 @@ void ConvGradKernel(const Context& dev_ctx, if (is_expand) { set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); } - paddle::operators::math::Col2VolFunctor col2vol; + phi::funcs::Col2VolFunctor col2vol; paddle::operators::math:: Col2ImFunctor col2im; @@ -206,7 +206,7 @@ void ConvGradKernel(const Context& dev_ctx, paddle::operators::math:: Im2ColFunctor im2col; - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { DenseTensor out_grad_batch = transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); @@ -381,7 +381,7 @@ void ConvGradGradKernel(const Context& dev_ctx, if (is_expand) { set_zero(dev_ctx, &transformed_dX, static_cast(0)); } - paddle::operators::math::Col2VolFunctor col2vol; + phi::funcs::Col2VolFunctor col2vol; paddle::operators::math:: Col2ImFunctor col2im; @@ -431,7 +431,7 @@ void ConvGradGradKernel(const Context& dev_ctx, paddle::operators::math:: Im2ColFunctor im2col; - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; ++i) { DenseTensor dy_batch = transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); @@ -480,7 +480,7 @@ void ConvGradGradKernel(const Context& dev_ctx, paddle::operators::math:: Im2ColFunctor im2col; - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; ++i) { 
DenseTensor ddy_batch = transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h index eb2d183981213..59bea1d0564c6 100644 --- a/paddle/phi/kernels/impl/conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -15,12 +15,12 @@ #pragma once #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/conv_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/vol2col.h" namespace phi { @@ -133,7 +133,7 @@ void ConvKernelImpl(const Context& dev_ctx, int in_step = static_cast(transformed_input.dims()[1]) / groups; int out_step = static_cast(transformed_output.dims()[1]) / groups; - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; paddle::operators::math:: Im2ColFunctor im2col; diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index b325f9ff6b31b..e25a6fd56ee2a 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" @@ -23,6 +22,7 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/vol2col.h" namespace phi { @@ -146,7 +146,7 @@ void ConvTransposeGradRawKernel(const Context& ctx, paddle::operators::math:: Im2ColFunctor im2col; - paddle::operators::math::Vol2ColFunctor vol2col; + phi::funcs::Vol2ColFunctor vol2col; funcs::ConcatFunctor concat_functor; if (dx) { diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h index c8272981e221b..a854bf3ee70de 100644 --- a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" @@ -23,6 +22,7 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/vol2col.h" namespace phi { @@ -139,7 +139,7 @@ void ConvTransposeRawKernel(const Context& ctx, paddle::operators::math:: Col2ImFunctor col2im; - paddle::operators::math::Col2VolFunctor col2vol; + phi::funcs::Col2VolFunctor col2vol; funcs::ConcatFunctor concat_functor; // convolution transpose: gemm + col2im or col2vol (similar to conv-backward From 51b081230aecbec3c6614713a82ba6fdf74f6c35 Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Tue, 22 Nov 2022 17:25:29 +0800 Subject: [PATCH 151/210] [remove fluid] under fleet meta_optimizers_wz (#47888) * [remove fluid] under fleet meta_optimizers_wz * [remove 
fluid] under fleet meta_optimizers_wz * update * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz * [remove fluid] under fleet meta_optimizers_wz --- .../fleet/base/meta_optimizer_factory.py | 1 + .../fleet/meta_optimizers/__init__.py | 1 + .../fleet/meta_optimizers/asp_optimizer.py | 2 +- .../fleet/meta_optimizers/common.py | 8 +- .../fleet/meta_optimizers/dgc_optimizer.py | 423 +++++++++++++++- .../fp16_allreduce_optimizer.py | 6 +- .../graph_execution_optimizer.py | 5 +- .../fleet/meta_optimizers/lamb_optimizer.py | 2 +- .../fleet/meta_optimizers/lars_optimizer.py | 3 +- .../meta_optimizers/localsgd_optimizer.py | 69 +-- .../parameter_server_graph_optimizer.py | 4 +- .../parameter_server_optimizer.py | 7 +- .../meta_optimizers/pipeline_optimizer.py | 4 +- .../fleet/meta_optimizers/ps_optimizer.py | 9 +- python/paddle/fluid/optimizer.py | 472 ------------------ .../collective/fleet/test_dgc_optimizer.py | 4 +- .../fluid/tests/unittests/dist_mnist.py | 2 +- .../fluid/tests/unittests/dist_se_resnext.py | 16 +- .../unittests/test_imperative_optimizer.py | 3 +- .../unittests/test_imperative_optimizer_v2.py | 3 +- 20 files changed, 505 insertions(+), 539 deletions(-) diff --git a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py index dd4611fc0a8c1..2577df9380e38 100755 --- a/python/paddle/distributed/fleet/base/meta_optimizer_factory.py +++ b/python/paddle/distributed/fleet/base/meta_optimizer_factory.py @@ -24,6 +24,7 @@ # should be removed meta_optimizer_names.remove("HybridParallelOptimizer") meta_optimizer_names.remove("HeterParallelOptimizer") +meta_optimizer_names.remove("DGCMomentumOptimizer") class MetaOptimizerFactory: diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 1eae4be579aa7..feb7b125adc1c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -24,6 +24,7 @@ from .lars_optimizer import LarsOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from .dgc_optimizer import DGCOptimizer +from .dgc_optimizer import DGCMomentumOptimizer from .lamb_optimizer import LambOptimizer from .fp16_allreduce_optimizer import FP16AllReduceOptimizer from .sharding_optimizer import ShardingOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py index 637fa31a6b722..a2f494e4a8438 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/asp_optimizer.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -from paddle.fluid.contrib.sparsity.asp import ASPHelper from .meta_optimizer_base import MetaOptimizerBase +from paddle.fluid.contrib.sparsity.asp import ASPHelper __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 03ed84563b628..bbcd1d82159ea 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -13,9 +13,9 @@ # limitations under the License. import os - -import paddle.fluid as fluid -from paddle.fluid import core, unique_name +import paddle +from paddle.framework import core +from paddle.utils import unique_name from ..base.private_helper_function import wait_server_ready __all__ = [] @@ -62,7 +62,7 @@ def __init__(self, role_maker, nrings=1, wait_port=True): def update_startup_program(self, startup_program=None): self.startup_program = startup_program if startup_program is None: - self.startup_program = fluid.default_startup_program() + self.startup_program = paddle.static.default_startup_program() endpoints = self.role_maker._get_trainer_endpoints() current_endpoint = endpoints[self.role_maker._worker_index()] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index a8861f12ccf39..1c728ed16eddd 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -11,12 +11,433 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle.fluid.optimizer import Momentum, DGCMomentumOptimizer +from functools import reduce from .meta_optimizer_base import MetaOptimizerBase import logging __all__ = [] +from paddle.fluid.layers import tensor +import paddle +from paddle import framework +from paddle.framework import core +from paddle.common_ops_import import LayerHelper +from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops +from paddle.fluid.optimizer import Optimizer, Momentum +from paddle.fluid.dygraph import base as imperative_base + + +class DGCMomentumOptimizer(Optimizer): + _u_velocity_acc_str = "_dgc_u_" + _v_velocity_acc_str = "_dgc_v_" + + def __init__( + self, + learning_rate, + momentum, + rampup_begin_step, + rampup_step=1, + sparsity=[0.999], + parameter_list=None, + use_nesterov=False, + num_trainers=None, + regularization=None, + grad_clip=None, + name=None, + ): + if framework._non_static_mode(): + raise Exception("In dygraph, don't support DGCMomentumOptimizer.") + + assert ( + core.is_compiled_with_cuda() + ), "Paddle is not compiled with CUDA. DGC is only support GPU for now." 
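        # Behavioral note, following the DGCMomentumOptimizer docstring that is
        # removed from python/paddle/fluid/optimizer.py later in this patch:
        # gradients are sent densely until `rampup_begin_step`; after that the
        # `sparsity` list is stepped through over `rampup_step` steps, so only
        # the top (1 - sparsity) fraction of gradient elements is transmitted,
        # and the last sparsity value is kept once the list is exhausted.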
+ + assert learning_rate is not None + assert momentum is not None + super().__init__( + learning_rate=learning_rate, + parameter_list=parameter_list, + regularization=regularization, + grad_clip=grad_clip, + name=name, + ) + self.type = "dgc_momentum" + self._momentum = momentum + self._use_nesterov = bool(use_nesterov) + + assert rampup_begin_step >= 0, "rampup_begin_step must >= 0" + self._rampup_begin_step = rampup_begin_step + self._rampup_step = rampup_step + self._sparsity = sparsity + + self._rampup_begin_step_var = None + self._global_step_var = None + + self._dgc_clip_norm = None + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipByNorm): + raise TypeError( + "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" + ) + assert isinstance(num_trainers, int), ( + "The type of num_trainers should be 'int', but received %s" + % type(num_trainers) + ) + assert ( + num_trainers > 0 + ), "The value of num_trainers should be greater than 0!" + + self._num_trainers = num_trainers + self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5) + + self.regular_type, self.regular_coeff = self._get_regularization_param( + self.regularization + ) + + def _get_regularization_param(self, regularization): + regular_type = 0 + regular_coeff = 0.0 + + if regularization is not None: + regular_coeff = regularization._regularization_coeff + from paddle.fluid.regularizer import L1Decay, L2Decay + + if isinstance(regularization, L1Decay): + regular_type = 1 + elif isinstance(regularization, L2Decay): + regular_type = 2 + else: + assert False, 'regularization must be None|L1Decay|L2Deacy' + return regular_type, regular_coeff + + def _is_use_dgc(self, param_var, grad_var): + var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) + if ( + var_numel < 16384 + or param_var.type == core.VarDesc.VarType.SELECTED_ROWS + or grad_var.type == core.VarDesc.VarType.SELECTED_ROWS + or param_var.dtype != core.VarDesc.VarType.FP32 + ): + return False + return True + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, paddle.fluid.framework.Block) + velocity_acc = self._get_accumulator( + self._u_velocity_acc_str, param_and_grad[0] + ) + assert velocity_acc is not None + + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": self._create_param_lr(param_and_grad), + } + outputs = { + "ParamOut": param_and_grad[0], + "VelocityOut": velocity_acc, + } + attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} + + if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]): + type = "momentum" + else: + type = "dgc_momentum" + inputs.update( + { + "current_step": self._global_step_var, + "nranks": self._nranks_var, + } + ) + outputs.update({'Grad_out': param_and_grad[1]}) + attrs.update({"rampup_begin_step": float(self._rampup_begin_step)}) + + # create the dgc momentum optimize op + dgc_momentum_op = block.append_op( + type=type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) + return dgc_momentum_op + + def _add_auto_increment_var(self, counter_name, begin, step=1): + helper = LayerHelper('global_step_counter') + counter, is_new_var = helper.create_or_get_global_variable( + name=counter_name, dtype='float32', shape=[1], persistable=True + ) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=paddle.fluid.initializer.Constant( + value=float(begin - 1), force_cpu=True + ), + ) 
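            # The increment op is prepended rather than appended so that the
            # step counter is updated at the very start of the main program,
            # before any op that reads `current_step` (e.g. the dgc and
            # dgc_momentum ops) executes.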
+ helper.main_program.global_block()._prepend_op( + type='increment', + inputs={'X': [counter]}, + outputs={'Out': [counter]}, + attrs={'step': float(step)}, + stop_gradient=True, + ) + counter.stop_gradient = True + + return counter + + def _add_nranks_var(self, name, value=-1): + helper = LayerHelper('global_step_counter') + counter, is_new_var = helper.create_or_get_global_variable( + name=name, dtype='float32', shape=[1], persistable=True + ) + if is_new_var: + helper.set_variable_initializer( + counter, + initializer=paddle.fluid.initializer.Constant( + value=float(value), force_cpu=True + ), + ) + counter.stop_gradient = True + + return counter + + def _append_dgc_ops(self, param_and_grads): + main_program = paddle.static.default_main_program() + main_program._enable_dgc = True + + # step counter + self._global_step_var = self._add_auto_increment_var( + counter_name=core.dgc.kDGCCounterName(), begin=0 + ) + + self._nranks_var = self._add_nranks_var( + name=core.dgc.kDGCNRanksName(), value=-1 + ) + + # rampup begin step var for all_reduce_op_handle + self._rampup_begin_step_var = tensor.create_global_var( + shape=[1], + dtype=core.VarDesc.VarType.FP32, + persistable=True, + name=core.dgc.kDGCRampUpBeginStepName(), + value=self._rampup_begin_step * 1.0, + force_cpu=True, + ) + + self.helper = LayerHelper(self.__class__.__name__) + + for param_var, grad_var in param_and_grads: + # reuse velocity in dgc_op and dgc_momentum_op + u_var = self._add_accumulator(self._u_velocity_acc_str, param_var) + + if not self._is_use_dgc(param_var, grad_var): + continue + + v_var = self._add_accumulator(self._v_velocity_acc_str, param_var) + + k_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCKName(), + value=0.0, + force_cpu=True, + ) + + encoded_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCEncodedName(), + value=0.0, + force_cpu=False, + ) + + gather_var = tensor.create_global_var( + shape=[1], + dtype=param_var.dtype, + persistable=True, + name=param_var.name + core.dgc.kDGCGatherName(), + value=0.0, + force_cpu=False, + ) + + # del back oprolevarname + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + for op in main_program.global_block().ops: + if not self._is_the_backward_op(op): + continue + + var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] + if param_var.name not in var_attr: + continue + + var_attr.remove(param_var.name) + var_attr.remove(grad_var.name) + if len(var_attr) > 1: + op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) + else: + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + clip_var = grad_var + if self._dgc_clip_norm is not None: + clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm) + self._dgc_op( + param_var, + clip_var, + grad_var, + u_var, + v_var, + k_var, + encoded_var, + gather_var, + ) + + def _is_the_backward_op(self, op): + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + if op_maker.kOpRoleVarAttrName() in op.attr_names and int( + op.all_attrs()[op_maker.kOpRoleAttrName()] + ) == int(backward): + return True + return False + + def _clip_by_norm(self, x, max_norm, name=None): + args = {'x': x, 'max_norm': max_norm, 'name': name} + + helper = LayerHelper("dgc_clip_by_norm_op", **args) + + if name is None: + name = paddle.fluid.unique_name.generate_with_ignorable_key( + 
".".join([helper.name, 'tmp']) + ) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) + + helper.append_op( + type="dgc_clip_by_norm", + inputs={"X": x, "current_step": self._global_step_var}, + attrs={ + "max_norm": max_norm, + "rampup_begin_step": float(self._rampup_begin_step), + }, + outputs={"Out": out}, + ) + return out + + def _append_clip_norm(self, grad_var, clip_norm): + with grad_var.block.program._backward_role_guard(): + return self._clip_by_norm( + x=grad_var, max_norm=clip_norm, name=grad_var.name + ) + + def _dgc_op( + self, + param_var, + clip_var, + grad_var, + u_var, + v_var, + k_var, + encoded_var, + gather_var, + ): + block = paddle.static.default_main_program().global_block() + op_maker = core.op_proto_and_checker_maker + + regular_type = self.regular_type + regular_coeff = self.regular_coeff + # The regularizer of the Parameters have higher priority + if param_var.regularizer is not None: + regular_type, regular_coeff = self._get_regularization_param( + param_var.regularizer + ) + + dgc_op = block.append_op( + type="dgc", + inputs={ + "U": u_var, + "V": v_var, + "Grad": clip_var, + "Param": param_var, + "current_step": self._global_step_var, + "nranks": self._nranks_var, + }, + outputs={ + "U_out": u_var, + "V_out": v_var, + "EncodeGrad": encoded_var, + "k": k_var, + "Grad_out": grad_var, + "GatherBuff": gather_var, + }, + attrs={ + "m": self._momentum, + "sparsity": self._sparsity, + "use_nesterov": self._use_nesterov, + "rampup_begin_step": float(self._rampup_begin_step), + "rampup_step": float(self._rampup_step), + "regular_coeff": float(regular_coeff), + "regular_type": int(regular_type), + }, + stop_gradient=True, + ) + + backward = op_maker.OpRole.Backward + dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward) + dgc_op._set_attr( + op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name] + ) + + @imperative_base.no_grad() + def apply_gradients(self, params_grads): + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + # Maybe need a grad allreduce pass. 
+ self._append_dgc_ops(params_grads) + + params_grads = sorted(params_grads, key=lambda x: x[0].name) + ( + params_grads, + table_param_and_grad, + table_optimize_op, + ) = self._process_distribute_lookuptable(params_grads) + + not_dgc_params_grads = [] + dgc_params_grads = [] + # DGC clip and regularization in optimizer.backward + for param, grad in params_grads: + if not self._is_use_dgc(param, grad): + not_dgc_params_grads.append((param, grad)) + else: + dgc_params_grads.append((param, grad)) + + # 'optimizer(grad_clip)' or 'set_gradient_clip' + if self._grad_clip is not None: + not_dgc_params_grads = self._grad_clip(not_dgc_params_grads) + else: + not_dgc_params_grads = append_gradient_clip_ops( + not_dgc_params_grads + ) + + not_dgc_params_grads = self.append_regularization_ops( + not_dgc_params_grads, self.regularization + ) + + params_grads = not_dgc_params_grads + dgc_params_grads + params_grads = sorted(params_grads, key=lambda x: x[0].name) + + optimize_ops = self._create_optimization_pass(params_grads) + if table_optimize_op is not None: + optimize_ops.append(table_optimize_op) + params_grads.append(table_param_and_grad) + + return optimize_ops + class DGCOptimizer(MetaOptimizerBase): def __init__(self, optimizer): diff --git a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py index 0ab95830babfb..1a29448e0245d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle.fluid import core, framework, unique_name +from paddle.framework import core +from paddle.utils import unique_name from .meta_optimizer_base import MetaOptimizerBase +import paddle __all__ = [] @@ -133,7 +135,7 @@ def fp16_compression(param_and_grads): with block.program._optimized_guard( [param, grad] - ), framework.name_scope('fp16_allreduce'): + ), paddle.static.name_scope('fp16_allreduce'): cast_op = block.append_op( type="cast", inputs={"X": grad}, diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 1dfdce6f6c638..ccc4fecbb5486 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -13,8 +13,7 @@ import copy import paddle -from paddle.fluid.framework import core -from paddle.fluid import compiler +from paddle.framework import core from .meta_optimizer_base import MetaOptimizerBase from ..base.private_helper_function import wait_server_ready import logging @@ -247,7 +246,7 @@ def _try_to_compile(self, startup_program, main_program, loss): ) local_build_strategy.enable_backward_optimizer_op_deps = True - self._compiled_program = compiler.CompiledProgram(main_program) + self._compiled_program = paddle.static.CompiledProgram(main_program) self._compiled_program.with_data_parallel( loss_name=loss.name, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 0f1ba5d29da78..b160c5f6fa789 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ 
b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.optimizer import LambOptimizer as LAMB from .meta_optimizer_base import MetaOptimizerBase +from paddle.fluid.optimizer import AdamOptimizer import logging __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 0eb4be0ca87e6..5c716bd375ac4 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle.fluid.optimizer import Momentum, LarsMomentumOptimizer +from paddle.fluid.optimizer import Momentum +from paddle.fluid.optimizer import LarsMomentumOptimizer from .meta_optimizer_base import MetaOptimizerBase import logging diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 62ff253fb7765..e73d3c6b4b0f9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -13,8 +13,8 @@ # limitations under the License. import paddle -from paddle.fluid import program_guard, layers, default_main_program -from paddle.fluid import default_startup_program +from paddle.static import program_guard, default_main_program +from paddle.static import default_startup_program from .meta_optimizer_base import MetaOptimizerBase from .common import CollectiveHelper, OP_ROLE_KEY, OpRole @@ -83,7 +83,7 @@ def create_snapshot_vars(self, program): def init_snapshot_vars(self, startup_program, param2snapshot): with program_guard(startup_program): for param, snapshot in param2snapshot: - layers.assign(param, snapshot) + paddle.assign(param, snapshot) def minimize_impl( self, loss, startup_program=None, parameter_list=None, no_grad_set=None @@ -109,8 +109,8 @@ def minimize_impl( p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=1) - k_steps = layers.create_global_var( + step = paddle.fluid.layers.autoincreased_step_counter(begin=1) + k_steps = paddle.static.create_global_var( name="k_steps", shape=[1], value=k_steps_value, @@ -118,7 +118,7 @@ def minimize_impl( persistable=True, ) - begin_step = layers.create_global_var( + begin_step = paddle.static.create_global_var( name="begin_step", shape=[1], value=begin_step_value, @@ -126,7 +126,7 @@ def minimize_impl( persistable=True, ) - last_step = layers.create_global_var( + last_step = paddle.static.create_global_var( name="last_step", shape=[1], value=begin_step_value, @@ -194,12 +194,14 @@ def communicate(): outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}, ) - layers.assign(step, last_step) + paddle.assign(step, last_step) def begin_localsgd(): - layers.cond(step - last_step == k_steps, communicate) + paddle.static.nn.cond(step - last_step == k_steps, communicate) - layers.cond(step > begin_step, begin_localsgd, communicate) + paddle.static.nn.cond( + step > begin_step, begin_localsgd, 
communicate + ) return minimized @@ -225,7 +227,7 @@ def _can_apply(self): return False return ( - isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) + isinstance(self.inner_opt, paddle.optimizer.Momentum) or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) @@ -268,7 +270,7 @@ def create_snapshot_vars(self, program): def init_snapshot_vars(self, startup_program, param2snapshot): with program_guard(startup_program): for param, snapshot in param2snapshot: - layers.assign(param, snapshot) + paddle.assign(param, snapshot) def _generate_avg_loss(self, program_block, loss, avg_loss): program_block.append_op( @@ -324,9 +326,9 @@ def minimize_impl( p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=1) + step = paddle.fluid.layers.autoincreased_step_counter(begin=1) - k_steps = layers.create_global_var( + k_steps = paddle.static.create_global_var( name="k_steps", shape=[1], value=int(init_k_steps), @@ -334,7 +336,7 @@ def minimize_impl( persistable=True, ) - begin_step = layers.create_global_var( + begin_step = paddle.static.create_global_var( name="begin_step", shape=[1], value=int(begin_step_value), @@ -342,7 +344,7 @@ def minimize_impl( persistable=True, ) - last_step = layers.create_global_var( + last_step = paddle.static.create_global_var( name="last_step", shape=[1], value=int(0), @@ -350,7 +352,7 @@ def minimize_impl( persistable=True, ) - avg_loss = layers.create_global_var( + avg_loss = paddle.static.create_global_var( name="avg_loss", shape=[1], value=float(0), @@ -358,7 +360,7 @@ def minimize_impl( persistable=True, ) - lr_0 = layers.create_global_var( + lr_0 = paddle.static.create_global_var( name="lr_0", shape=[1], value=float(0), @@ -366,7 +368,7 @@ def minimize_impl( persistable=True, ) - loss_0 = layers.create_global_var( + loss_0 = paddle.static.create_global_var( name="loss_0", shape=[1], value=float(0), @@ -378,10 +380,10 @@ def minimize_impl( def initialize(): self._generate_avg_loss(main_block, loss, avg_loss) - layers.assign(avg_loss, loss_0) - layers.assign(global_lr, lr_0) + paddle.assign(avg_loss, loss_0) + paddle.assign(global_lr, lr_0) - layers.cond(step == 1, initialize) + paddle.static.nn.cond(step == 1, initialize) def communicate(): sub_block = default_main_program().current_block() @@ -443,12 +445,13 @@ def communicate(): outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}, ) - layers.assign(step, last_step) + paddle.assign(step, last_step) def communicate_avg_loss(): communicate() self._generate_avg_loss(main_block, loss, avg_loss) - next_local_steps = layers.cast( + + next_local_steps = paddle.cast( paddle.ceil( paddle.sqrt( lr_0 @@ -459,11 +462,11 @@ def communicate_avg_loss(): ), dtype='int64', ) - max_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=16 + max_local_steps = paddle.full( + shape=[1], dtype='int64', fill_value=16 ) - min_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=1 + min_local_steps = paddle.full( + shape=[1], dtype='int64', fill_value=1 ) next_local_steps = paddle.minimum( next_local_steps, max_local_steps @@ -471,11 +474,15 @@ def communicate_avg_loss(): next_local_steps = paddle.maximum( next_local_steps, min_local_steps ) - layers.assign(next_local_steps, k_steps) + paddle.assign(next_local_steps, k_steps) def begin_localsgd(): - 
layers.cond(step - last_step == k_steps, communicate_avg_loss) + paddle.static.nn.cond( + step - last_step == k_steps, communicate_avg_loss + ) - layers.cond(step > begin_step, begin_localsgd, communicate) + paddle.static.nn.cond( + step > begin_step, begin_localsgd, communicate + ) return minimized diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index 22a1b82541d5b..74d57fe59bfad 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle.fluid import compiler from .parameter_server_optimizer import ParameterServerOptimizer +import paddle __all__ = [] @@ -56,7 +56,7 @@ def _try_to_compile(self, main_program, loss): build_strategy = dist_strategy.get_build_strategy() exec_strategy = dist_strategy.get_execute_strategy() - self._compiled_program = compiler.CompiledProgram(main_program) + self._compiled_program = paddle.static.CompiledProgram(main_program) self._compiled_program.with_data_parallel( loss_name=loss.name, diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 2ea83ada81236..362dec4e62257 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and +import paddle from paddle import fluid from .meta_optimizer_base import MetaOptimizerBase -from paddle.fluid import core +from paddle.framework import core import subprocess import re import os @@ -185,8 +186,8 @@ def _build_trainer_programs(self, compiled_config): return _main, _startup def _build_pserver_programs(self, compiled_config): - _main = fluid.Program() - _startup = fluid.Program() + _main = paddle.static.Program() + _startup = paddle.static.Program() from paddle.fluid.incubate.fleet.parameter_server.ir import ( pserver_pass as server, diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 655670f305305..45dde10b1ed7d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -import paddle.fluid as fluid +import paddle from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase from .common import ( @@ -210,7 +210,7 @@ def minimize_impl( orig_startup_program = ( startup_program if startup_program - else fluid.default_startup_program() + else paddle.static.default_startup_program() ) block = loss.block program = block.program diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 66b8acb4d7e07..31fcf3450d42c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -from paddle import fluid import paddle.distributed.passes from .meta_optimizer_base import MetaOptimizerBase -from paddle.fluid import core +from paddle.framework import core import subprocess import re import os @@ -111,8 +110,8 @@ def _init_ps_pass_context(self, loss, startup_program): build_var_distributed(attrs) # server - attrs['_main_server'] = fluid.Program() - attrs['_startup_server'] = fluid.Program() + attrs['_main_server'] = paddle.static.Program() + attrs['_startup_server'] = paddle.static.Program() attrs['tensor_table'] = {} self.pass_ctx._attrs = attrs @@ -203,7 +202,7 @@ def get_sys_free_mem(): % (platform.system()) ) - if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer): + if not isinstance(self.inner_opt, paddle.fluid.optimizer.SGDOptimizer): return False free = get_sys_free_mem() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8e030a54d832b..c724a0f348db1 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1744,478 +1744,6 @@ def _append_optimize_op(self, block, param_and_grad): return momentum_op -class DGCMomentumOptimizer(Optimizer): - r""" - :api_attr: Static Graph - - DGC (Deep Gradient Compression) Momentum Optimizer. Original paper is https://arxiv.org/abs/1712.01887 - - DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\ - only gradients larger than a threshold are transmitted. - - To avoid losing information, DGC accumulates the rest of the gradients locally. - - Eventually, these gradients become large enough to be transmitted. - - Thus, DGC sends the large gradients immediately but eventually sends all of the gradients over time. - - To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance. - - DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. - - This optimizer will do two things: - - 1. Compress the gradient by get TopK import value from tensor \ - and use it for allreduce to reduce network bandwidth. - - 2. Call momentum to optimize the cost. - - Args: - learning_rate (float|Variable): The learning rate used to update parameters. \ - It can be a float value or a Variable with one float value as a data element. - momentum (float): Momentum factor. - rampup_begin_step (int): The beginning step from which gradient compression is implemented. - rampup_step (int): Time steps used in sparsity warm-up periods. 
Default is 1. - For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ - it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. \ - And when reach sparsity array ends, it will use 0.999 then and after. - sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \ - Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \ - the top [1%, 0.1%] important element will be transmitted. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipByNorm, optional): Gradient cliping strategy. ``DGCMomentumOptimizer`` only support - :ref:`api_fluid_clip_GradientClipByNorm` , and if not, it will raise TypeError. Default None, - meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - optimizer = fluid.optimizer.DGCMomentumOptimizer( - learning_rate=0.0001, - momentum=0.9, - rampup_step=1000, - rampup_begin_step=1252, - sparsity=[0.999, 0.999]) - - """ - _u_velocity_acc_str = "_dgc_u_" - _v_velocity_acc_str = "_dgc_v_" - - def __init__( - self, - learning_rate, - momentum, - rampup_begin_step, - rampup_step=1, - sparsity=[0.999], - parameter_list=None, - use_nesterov=False, - num_trainers=None, - regularization=None, - grad_clip=None, - name=None, - ): - if framework._non_static_mode(): - raise Exception("In dygraph, don't support DGCMomentumOptimizer.") - - assert ( - core.is_compiled_with_cuda() - ), "Paddle is not compiled with CUDA. DGC is only support GPU for now." - - assert learning_rate is not None - assert momentum is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "dgc_momentum" - self._momentum = momentum - self._use_nesterov = bool(use_nesterov) - - assert rampup_begin_step >= 0, "rampup_begin_step must >= 0" - self._rampup_begin_step = rampup_begin_step - self._rampup_step = rampup_step - self._sparsity = sparsity - - self._rampup_begin_step_var = None - self._global_step_var = None - - self._dgc_clip_norm = None - if grad_clip is not None: - if not isinstance(grad_clip, GradientClipByNorm): - raise TypeError( - "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm" - ) - assert isinstance(num_trainers, int), ( - "The type of num_trainers should be 'int', but received %s" - % type(num_trainers) - ) - assert ( - num_trainers > 0 - ), "The value of num_trainers should be greater than 0!" 
- - self._num_trainers = num_trainers - self._dgc_clip_norm = grad_clip.clip_norm * (num_trainers**-0.5) - - self.regular_type, self.regular_coeff = self._get_regularization_param( - self.regularization - ) - - def _get_regularization_param(self, regularization): - regular_type = 0 - regular_coeff = 0.0 - - if regularization is not None: - regular_coeff = regularization._regularization_coeff - from .regularizer import L1Decay, L2Decay - - if isinstance(regularization, L1Decay): - regular_type = 1 - elif isinstance(regularization, L2Decay): - regular_type = 2 - else: - assert False, 'regularization must be None|L1Decay|L2Deacy' - return regular_type, regular_coeff - - def _is_use_dgc(self, param_var, grad_var): - var_numel = abs(reduce(lambda x, y: x * y, param_var.shape)) - if ( - var_numel < 16384 - or param_var.type == core.VarDesc.VarType.SELECTED_ROWS - or grad_var.type == core.VarDesc.VarType.SELECTED_ROWS - or param_var.dtype != core.VarDesc.VarType.FP32 - ): - return False - return True - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - velocity_acc = self._get_accumulator( - self._u_velocity_acc_str, param_and_grad[0] - ) - assert velocity_acc is not None - - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": self._create_param_lr(param_and_grad), - } - outputs = { - "ParamOut": param_and_grad[0], - "VelocityOut": velocity_acc, - } - attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} - - if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]): - type = "momentum" - else: - type = "dgc_momentum" - inputs.update( - { - "current_step": self._global_step_var, - "nranks": self._nranks_var, - } - ) - outputs.update({'Grad_out': param_and_grad[1]}) - attrs.update({"rampup_begin_step": float(self._rampup_begin_step)}) - - # create the dgc momentum optimize op - dgc_momentum_op = block.append_op( - type=type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - return dgc_momentum_op - - def _add_auto_increment_var(self, counter_name, begin, step=1): - helper = LayerHelper('global_step_counter') - counter, is_new_var = helper.create_or_get_global_variable( - name=counter_name, dtype='float32', shape=[1], persistable=True - ) - if is_new_var: - helper.set_variable_initializer( - counter, - initializer=Constant(value=float(begin - 1), force_cpu=True), - ) - helper.main_program.global_block()._prepend_op( - type='increment', - inputs={'X': [counter]}, - outputs={'Out': [counter]}, - attrs={'step': float(step)}, - stop_gradient=True, - ) - counter.stop_gradient = True - - return counter - - def _add_nranks_var(self, name, value=-1): - helper = LayerHelper('global_step_counter') - counter, is_new_var = helper.create_or_get_global_variable( - name=name, dtype='float32', shape=[1], persistable=True - ) - if is_new_var: - helper.set_variable_initializer( - counter, - initializer=Constant(value=float(value), force_cpu=True), - ) - counter.stop_gradient = True - - return counter - - def _append_dgc_ops(self, param_and_grads): - main_program = default_main_program() - main_program._enable_dgc = True - - # step counter - self._global_step_var = self._add_auto_increment_var( - counter_name=core.dgc.kDGCCounterName(), begin=0 - ) - - self._nranks_var = self._add_nranks_var( - name=core.dgc.kDGCNRanksName(), value=-1 - ) - - # rampup begin step var for all_reduce_op_handle - self._rampup_begin_step_var = tensor.create_global_var( - shape=[1], 
- dtype=core.VarDesc.VarType.FP32, - persistable=True, - name=core.dgc.kDGCRampUpBeginStepName(), - value=self._rampup_begin_step * 1.0, - force_cpu=True, - ) - - self.helper = LayerHelper(self.__class__.__name__) - - for param_var, grad_var in param_and_grads: - # reuse velocity in dgc_op and dgc_momentum_op - u_var = self._add_accumulator(self._u_velocity_acc_str, param_var) - - if not self._is_use_dgc(param_var, grad_var): - continue - - v_var = self._add_accumulator(self._v_velocity_acc_str, param_var) - - k_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCKName(), - value=0.0, - force_cpu=True, - ) - - encoded_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCEncodedName(), - value=0.0, - force_cpu=False, - ) - - gather_var = tensor.create_global_var( - shape=[1], - dtype=param_var.dtype, - persistable=True, - name=param_var.name + core.dgc.kDGCGatherName(), - value=0.0, - force_cpu=False, - ) - - # del back oprolevarname - op_maker = core.op_proto_and_checker_maker - backward = core.op_proto_and_checker_maker.OpRole.Backward - for op in main_program.global_block().ops: - if not self._is_the_backward_op(op): - continue - - var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] - if param_var.name not in var_attr: - continue - - var_attr.remove(param_var.name) - var_attr.remove(grad_var.name) - if len(var_attr) > 1: - op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) - else: - op._remove_attr(op_maker.kOpRoleVarAttrName()) - - clip_var = grad_var - if self._dgc_clip_norm is not None: - clip_var = self._append_clip_norm(grad_var, self._dgc_clip_norm) - self._dgc_op( - param_var, - clip_var, - grad_var, - u_var, - v_var, - k_var, - encoded_var, - gather_var, - ) - - def _is_the_backward_op(self, op): - op_maker = core.op_proto_and_checker_maker - backward = core.op_proto_and_checker_maker.OpRole.Backward - if op_maker.kOpRoleVarAttrName() in op.attr_names and int( - op.all_attrs()[op_maker.kOpRoleAttrName()] - ) == int(backward): - return True - return False - - def _clip_by_norm(self, x, max_norm, name=None): - args = {'x': x, 'max_norm': max_norm, 'name': name} - - helper = LayerHelper("dgc_clip_by_norm_op", **args) - - if name is None: - name = unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp']) - ) - - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False - ) - - helper.append_op( - type="dgc_clip_by_norm", - inputs={"X": x, "current_step": self._global_step_var}, - attrs={ - "max_norm": max_norm, - "rampup_begin_step": float(self._rampup_begin_step), - }, - outputs={"Out": out}, - ) - return out - - def _append_clip_norm(self, grad_var, clip_norm): - with grad_var.block.program._backward_role_guard(): - return self._clip_by_norm( - x=grad_var, max_norm=clip_norm, name=grad_var.name - ) - - def _dgc_op( - self, - param_var, - clip_var, - grad_var, - u_var, - v_var, - k_var, - encoded_var, - gather_var, - ): - block = framework.default_main_program().global_block() - op_maker = core.op_proto_and_checker_maker - - regular_type = self.regular_type - regular_coeff = self.regular_coeff - # The regularizer of the Parameters have higher priority - if param_var.regularizer is not None: - regular_type, regular_coeff = self._get_regularization_param( - param_var.regularizer - ) - - dgc_op = block.append_op( - type="dgc", - inputs={ - "U": u_var, - "V": v_var, - "Grad": clip_var, 
- "Param": param_var, - "current_step": self._global_step_var, - "nranks": self._nranks_var, - }, - outputs={ - "U_out": u_var, - "V_out": v_var, - "EncodeGrad": encoded_var, - "k": k_var, - "Grad_out": grad_var, - "GatherBuff": gather_var, - }, - attrs={ - "m": self._momentum, - "sparsity": self._sparsity, - "use_nesterov": self._use_nesterov, - "rampup_begin_step": float(self._rampup_begin_step), - "rampup_step": float(self._rampup_step), - "regular_coeff": float(regular_coeff), - "regular_type": int(regular_type), - }, - stop_gradient=True, - ) - - backward = op_maker.OpRole.Backward - dgc_op._set_attr(op_maker.kOpRoleAttrName(), backward) - dgc_op._set_attr( - op_maker.kOpRoleVarAttrName(), [param_var.name, grad_var.name] - ) - - @imperative_base.no_grad - def apply_gradients(self, params_grads): - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. - # Maybe need a grad allreduce pass. - self._append_dgc_ops(params_grads) - - params_grads = sorted(params_grads, key=lambda x: x[0].name) - ( - params_grads, - table_param_and_grad, - table_optimize_op, - ) = self._process_distribute_lookuptable(params_grads) - - not_dgc_params_grads = [] - dgc_params_grads = [] - # DGC clip and regularization in optimizer.backward - for param, grad in params_grads: - if not self._is_use_dgc(param, grad): - not_dgc_params_grads.append((param, grad)) - else: - dgc_params_grads.append((param, grad)) - - # 'optimizer(grad_clip)' or 'set_gradient_clip' - if self._grad_clip is not None: - not_dgc_params_grads = self._grad_clip(not_dgc_params_grads) - else: - not_dgc_params_grads = append_gradient_clip_ops( - not_dgc_params_grads - ) - - not_dgc_params_grads = self.append_regularization_ops( - not_dgc_params_grads, self.regularization - ) - - params_grads = not_dgc_params_grads + dgc_params_grads - params_grads = sorted(params_grads, key=lambda x: x[0].name) - - optimize_ops = self._create_optimization_pass(params_grads) - if table_optimize_op is not None: - optimize_ops.append(table_optimize_op) - params_grads.append(table_param_and_grad) - - return optimize_ops - - class LarsMomentumOptimizer(Optimizer): r""" Momentum optimizer with LARS support diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py index 0da05a377b36d..335916e520c3c 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py @@ -24,7 +24,9 @@ class TestDGCMomentumOptimizer(unittest.TestCase): - class MockDGCMomentum(optimizer.DGCMomentumOptimizer): + class MockDGCMomentum( + paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer + ): def get_accumulators(self): return self._accumulators diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 856ac1b930bbf..2df9549918528 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -93,7 +93,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): if not use_dgc: opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) else: - opt = fluid.optimizer.DGCMomentumOptimizer( + opt = paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer( learning_rate=self.lr, momentum=0.9, rampup_begin_step=2 ) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py 
b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 0d8ed873f0398..ae7fb207d260f 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -247,13 +247,15 @@ def get_model(self, batch_size=2, use_dgc=False): regularization=fluid.regularizer.L2Decay(1e-4), ) else: - optimizer = fluid.optimizer.DGCMomentumOptimizer( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr - ), - momentum=0.9, - rampup_begin_step=0, - regularization=fluid.regularizer.L2Decay(1e-4), + optimizer = ( + paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr + ), + momentum=0.9, + rampup_begin_step=0, + regularization=fluid.regularizer.L2Decay(1e-4), + ) ) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index a75208d88d5ce..917876c9741a1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -35,7 +35,6 @@ ) from paddle.fluid.optimizer import ( ModelAverage, - DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, @@ -45,6 +44,8 @@ from test_imperative_base import new_program_scope from paddle.fluid.framework import _test_eager_guard +from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer + # Note(wangzhongpu) # In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index 4023d3596bac2..0c6853ce65368 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -32,7 +32,6 @@ ) from paddle.fluid.optimizer import ( ModelAverage, - DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, @@ -42,6 +41,8 @@ from test_imperative_base import new_program_scope from paddle.fluid.framework import _test_eager_guard +from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer + # Note(wangzhongpu) # In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. 
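The hunks above complete the move of DGCMomentumOptimizer out of paddle.fluid.optimizer: the class body (velocity accumulators, the dgc/dgc_momentum ops, clip-by-norm, and the modified apply_gradients) is removed from fluid, and the tests now construct the optimizer from paddle.distributed.fleet.meta_optimizers instead. The sketch below shows the new call site in isolation; it is a minimal illustration rather than part of the patch, and the learning rate, momentum, rampup_begin_step values and the avg_cost loss variable are placeholders for the example (DGC itself only activates for FP32 dense parameters with at least 16384 elements, per _is_use_dgc above).

    # Minimal sketch of the new import path introduced by this patch series.
    # A real run needs a static-graph program and a multi-GPU fleet setup;
    # only the construction of the optimizer is shown here.
    import paddle
    from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer

    paddle.enable_static()

    # Old, removed path:
    #   opt = fluid.optimizer.DGCMomentumOptimizer(...)
    opt = DGCMomentumOptimizer(
        learning_rate=0.001,     # placeholder value
        momentum=0.9,
        rampup_begin_step=2,     # begin gradient compression after 2 steps
    )
    # opt.minimize(avg_cost)     # avg_cost: loss tensor of an existing program

Only the import and constructor path changes for callers; the optimizer's arguments and behavior are otherwise the same as before the move.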
From 94c6ec86019128549ed96764fcab648bc4490dd6 Mon Sep 17 00:00:00 2001 From: ustiniankw <102717963+ustiniankw@users.noreply.github.com> Date: Tue, 22 Nov 2022 17:31:35 +0800 Subject: [PATCH 152/210] [Docs]fix math api en docs issue (#47448) * fix_docx_stanh * fix einsum api en docs issue * fix model api en docs issue * for codestyle * fix_einsum.py_einsum, test=document_fix * fix_model.py_Model, test=ducument_fix * fix_creation.py_meshgrid, test=document_fix * fix_linalg.py_slogdet, test=document_fix * fix_loss.py_SoftMarginLoss_CrossEntropyLoss_NLLLoss_BCELoss, test=document_fix * norm.py_SyncBatchNorm, test=document-fix * norm.py_SyncBatchNorm, test=document_fix * norm.py_SyncBatchNorm, test=document_fix * list18-30, test=document_fix * refix_list1-15, test=document_fix * deletefiles, test=document_fix * fixedapi_pre-commit, test=document_fix * fix_list31-45, test=document_fix * list111, test=document_fix * some_fix, test=document_fix * some_fix, test=document_fix * somefix, test=document_fix * somefix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * refix, test=document_fix * rerfix, test=document_fix Co-authored-by: Ligoml --- .../fleet/base/distributed_strategy.py | 822 ++++++++++-------- .../paddle/distributed/fleet/base/topology.py | 11 +- python/paddle/distributed/parallel.py | 3 + .../geometric/message_passing/send_recv.py | 8 +- python/paddle/geometric/reindex.py | 20 +- python/paddle/geometric/sampling/neighbors.py | 16 +- python/paddle/hapi/model.py | 23 +- .../operators/graph_sample_neighbors.py | 36 +- python/paddle/nn/functional/common.py | 156 ++-- python/paddle/nn/functional/distance.py | 6 +- python/paddle/nn/functional/loss.py | 65 +- python/paddle/nn/functional/pooling.py | 49 +- python/paddle/nn/layer/activation.py | 6 +- python/paddle/nn/layer/distance.py | 13 +- python/paddle/nn/layer/loss.py | 143 ++- python/paddle/nn/layer/norm.py | 43 +- python/paddle/nn/layer/pooling.py | 2 + python/paddle/nn/quant/quant_layers.py | 4 + python/paddle/optimizer/lr.py | 15 +- python/paddle/tensor/creation.py | 1 + python/paddle/tensor/einsum.py | 166 ++-- python/paddle/tensor/linalg.py | 23 +- python/paddle/tensor/math.py | 6 +- python/paddle/vision/ops.py | 61 +- 24 files changed, 901 insertions(+), 797 deletions(-) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index bdc40618676fd..34207f6ce6f31 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -112,6 +112,7 @@ class DistributedStrategy: def __init__(self): """ + DistributedStrategy is the main configuration entry for distributed training of Paddle. All of the distributed training configurations can be configured in DistributedStrategy, such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), @@ -153,33 +154,35 @@ def __setattr__(self, key, value): def save_to_prototxt(self, output): """ + Serialize current DistributedStrategy to string and save to output file Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True + strategy.recompute = True + strategy.recompute_configs = {"checkpoints": ["x"]} + strategy.save_to_prototxt("dist_strategy.prototxt") - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True - strategy.recompute = True - strategy.recompute_configs = {"checkpoints": ["x"]} - strategy.save_to_prototxt("dist_strategy.prototxt") """ with open(output, "w") as fout: fout.write(str(self.strategy)) def load_from_prototxt(self, pb_file): """ + Load from prototxt file for DistributedStrategy initialization Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.load_from_prototxt("dist_strategy.prototxt") - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( @@ -192,17 +195,17 @@ def execution_strategy(self): Configure ExecutionStrategy for DistributedStrategy Examples: + .. code-block:: python - .. code-block:: python + import paddle + exe_strategy = paddle.static.ExecutionStrategy() + exe_strategy.num_threads = 10 + exe_strategy.num_iteration_per_drop_scope = 10 + exe_strategy.num_iteration_per_run = 10 - import paddle - exe_strategy = paddle.static.ExecutionStrategy() - exe_strategy.num_threads = 10 - exe_strategy.num_iteration_per_drop_scope = 10 - exe_strategy.num_iteration_per_run = 10 + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.execution_strategy = exe_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.execution_strategy = exe_strategy """ execution_strategy = paddle.fluid.ExecutionStrategy() fields = self.strategy.execution_strategy.DESCRIPTOR.fields @@ -228,27 +231,28 @@ def execution_strategy(self, strategy): @property def build_strategy(self): """ + Configure BuildStrategy for DistributedStrategy Note that the properties of BuildStrategy are valid in DistributedStrategy only if the property is non-distributed strategy. Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_sequential_execution = True + build_strategy.fuse_elewise_add_act_ops = True + build_strategy.fuse_bn_act_ops = True + build_strategy.enable_auto_fusion = True + build_strategy.fuse_relu_depthwise_conv = True + build_strategy.fuse_broadcast_ops = True + build_strategy.fuse_all_optimizer_ops = True + build_strategy.enable_inplace = True - import paddle - build_strategy = paddle.static.BuildStrategy() - build_strategy.enable_sequential_execution = True - build_strategy.fuse_elewise_add_act_ops = True - build_strategy.fuse_bn_act_ops = True - build_strategy.enable_auto_fusion = True - build_strategy.fuse_relu_depthwise_conv = True - build_strategy.fuse_broadcast_ops = True - build_strategy.fuse_all_optimizer_ops = True - build_strategy.enable_inplace = True + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.build_strategy = build_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.build_strategy = build_strategy """ build_strategy = paddle.fluid.BuildStrategy() @@ -278,15 +282,18 @@ def build_strategy(self, strategy): @property def gradient_scale_configs(self): """ + Set the strategy of gradient scale + Examples: + .. code-block:: python - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_scale_configs = {'scale_strategy': 'avg'} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_scale_configs = {'scale_strategy': 'avg'} Note that, strategy must be in 'avg', 'sum' or 'customized' + """ return get_msg_dict(self.strategy.gradient_scale_configs) @@ -303,24 +310,25 @@ def gradient_scale_configs(self, config): @property def a_sync(self): """ + Indicating whether we are using asynchronous stocastic gradient descent updates for training. This property is valid when we are using parameter server training, which is implied by setting approperate RoleMaker Default value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.a_sync @@ -340,6 +348,7 @@ def a_sync(self, flag): @property def a_sync_configs(self): """ + Set a_sync update configurations. In general, asynchronous parameter server training has serveral configurable settings that can be configured through a dict. @@ -360,20 +369,19 @@ def a_sync_configs(self): runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: + .. code-block:: python - .. 
code-block:: python - - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True - configs = {"k_steps": 1024, "send_queue_size": 32} - strategy.a_sync_configs = configs + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True + configs = {"k_steps": 1024, "send_queue_size": 32} + strategy.a_sync_configs = configs - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.a_sync_configs) @@ -389,6 +397,7 @@ def a_sync_configs(self, configs): @property def trainer_desc_configs(self): """ + Set trainer desc configurations. **Notes**: @@ -401,19 +410,18 @@ def trainer_desc_configs(self): stat_var_names(list(str)): Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) - - strategy = fleet.DistributedStrategy() - configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} - strategy.trainer_desc_configs = configs + strategy = fleet.DistributedStrategy() + configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} + strategy.trainer_desc_configs = configs - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.trainer_desc_configs) @@ -421,22 +429,23 @@ def trainer_desc_configs(self): @property def adam_d2sum(self): """ + set adam_d2sum Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.adam_d2sum = True # by default this is False - strategy = fleet.DistributedStrategy() - strategy.adam_d2sum = True # by default this is False + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.adam_d2sum @@ -463,22 +472,30 @@ def trainer_desc_configs(self, configs): @property def fs_client_param(self): """ + Set fs client configurations. - **Notes**: + + Note: uri(str): the uri of fs client + user(str): the user_name of fs client + passwd(str): the passwd of fs client + hadoop_bin(str): + Examples: - .. 
code-block:: python - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) - strategy = fleet.DistributedStrategy() - configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} - strategy.fs_client_param = configs - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + .. code-block:: python + + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} + strategy.fs_client_param = configs + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.fs_client_param @@ -858,6 +875,7 @@ def amp(self, flag): @property def amp_configs(self): """ + Set automatic mixed precision training configurations. In general, amp has serveral configurable settings that can be configured through a dict. @@ -886,28 +904,27 @@ def amp_configs(self): Default True. Only takes effect when `use_pure_fp16` is turned on. Examples 1: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "custom_white_list": ['conv2d']} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "custom_white_list": ['conv2d']} Examples 2: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + # pure fp16 + strategy.amp_configs = { + "init_loss_scaling": 32768, + "use_pure_fp16": True + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - # pure fp16 - strategy.amp_configs = { - "init_loss_scaling": 32768, - "use_pure_fp16": True - } """ return get_msg_dict(self.strategy.amp_configs) @@ -920,16 +937,16 @@ def amp_configs(self, configs): @property def asp(self): """ + Indicating whether we are using automatic sparsity training Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.asp = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.asp = True # by default this is false """ return self.strategy.asp @@ -949,30 +966,31 @@ def recompute(self): Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + # suppose x and y are names of checkpoint tensors for recomputation + strategy.recompute_configs = {"checkpoints": ["x", "y"]} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - # suppose x and y are names of checkpoint tensors for recomputation - strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return self.strategy.recompute @property def sync_nccl_allreduce(self): """ + Indicating whether we are using synchronized all reduce in each communication thread We note that system overhead is usually lower when sync_nccl_allreduce = True Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_nccl_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_nccl_allreduce = True """ return self.strategy.sync_nccl_allreduce @@ -987,17 +1005,18 @@ def sync_nccl_allreduce(self, flag): @property def use_hierarchical_allreduce(self): """ + Indicating whether we are using hierarchical allreduce in collective communication Hierarchical allreduce often does allreduce within a certain node group and then do allreduce among the leaders of each group Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.use_hierarchical_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.use_hierarchical_allreduce = True """ return self.strategy.use_hierarchical_allreduce @@ -1014,16 +1033,17 @@ def use_hierarchical_allreduce(self, flag): @property def hierarchical_allreduce_inter_nranks(self): """ + Number of ranks for low level node groups in hierarchical allreduce Default value: number of GPU cards on each single GPU machine Example: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hierarchical_allreduce_inter_nranks = 8 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hierarchical_allreduce_inter_nranks = 8 """ return self.strategy.hierarchical_allreduce_inter_nranks @@ -1040,17 +1060,18 @@ def hierarchical_allreduce_inter_nranks(self, value): @property def sync_batch_norm(self): """ + Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_batch_norm = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_batch_norm = True """ return self.strategy.sync_batch_norm @@ -1066,16 +1087,17 @@ def sync_batch_norm(self, flag): @property def fuse_all_reduce_ops(self): """ + Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training Default value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_all_reduce_ops = False - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_all_reduce_ops = False """ return self.strategy.fuse_all_reduce_ops @@ -1090,17 +1112,18 @@ def fuse_all_reduce_ops(self, flag): @property def fuse_grad_size_in_MB(self): """ + Specifying the size of gradient to fuse in Mega-Bytes Default value: 32 Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_MB = 50 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_MB = 50 """ return self.strategy.fuse_grad_size_in_MB @@ -1115,6 +1138,7 @@ def fuse_grad_size_in_MB(self, value): @property def last_comm_group_size_MB(self): """ + Specifying the size of gradient to fuse in Mega-Bytes when the last group of each batch communicates. Making the last group small is useful to improve performance. @@ -1122,11 +1146,12 @@ def last_comm_group_size_MB(self): Default value: 1 Examples: - .. code-block:: python + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.last_comm_group_size_MB = 2 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.last_comm_group_size_MB = 2 """ return self.strategy.last_comm_group_size_MB @@ -1141,18 +1166,19 @@ def last_comm_group_size_MB(self, value): @property def find_unused_parameters(self): """ + Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.find_unused_parameters = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.find_unused_parameters = True """ return self.strategy.find_unused_parameters @@ -1184,17 +1210,18 @@ def _fuse_grad_size_in_TFLOPS(self, value): @property def nccl_comm_num(self): """ + Specifying the number of NCCL communicator Default value: 1 Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.nccl_comm_num = 2 """ return self.strategy.nccl_comm_num @@ -1218,6 +1245,7 @@ def recompute(self, flag): @property def recompute_configs(self): """ + Set recompute configurations. **Note**: @@ -1234,16 +1262,15 @@ def recompute_configs(self): specific here should be determined ("-1" is not allowed). Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = { - "checkpoints": ["x", "y"], - "enable_offload": True, - "checkpoint_shape": [100, 512, 1024] } + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": ["x", "y"], + "enable_offload": True, + "checkpoint_shape": [100, 512, 1024] } """ return get_msg_dict(self.strategy.recompute_configs) @@ -1259,6 +1286,7 @@ def recompute_configs(self, configs): @property def sharding(self): """ + Indicating whether we are using sharding Optimizer for memory optimization. We implement the sharding optimizer following the ZeRO-DP idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054). @@ -1269,12 +1297,12 @@ def sharding(self): Default value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True - import paddle.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True """ return self.strategy.sharding @@ -1289,6 +1317,7 @@ def sharding(self, flag): @property def sharding_configs(self): """ + Set sharding configurations. **Note**: @@ -1326,20 +1355,20 @@ def sharding_configs(self): Examples: + .. code-block:: python + + # sharding-DP, 2 nodes with 8 gpus per node + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 32, + "sharding_degree": 8, + "dp_degree": 2, + "gradient_merge_acc_step": 4, + } - .. code-block:: python - - # sharding-DP, 2 nodes with 8 gpus per node - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True - strategy.sharding_configs = { - "sharding_segment_strategy": "segment_broadcast_MB", - "segment_broadcast_MB": 32, - "sharding_degree": 8, - "dp_degree": 2, - "gradient_merge_acc_step": 4, - } """ return get_msg_dict(self.strategy.sharding_configs) @@ -1354,15 +1383,15 @@ def sharding_configs(self, configs): @property def without_graph_optimization(self): """ + Run program using Executor other than ParallelExecutor. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.without_graph_optimization = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True """ return self.strategy.without_graph_optimization @@ -1380,14 +1409,18 @@ def without_graph_optimization(self, flag): @property def _calc_comm_same_stream(self): """ + This based on raw_program_optimizer program Set whether use same stream for calc and comm when fuse allreduce The default value for the calc_comm_same_stream is False + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.calc_comm_same_stream = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.calc_comm_same_stream = True + """ return self.strategy.calc_comm_same_stream @@ -1404,14 +1437,18 @@ def _calc_comm_same_stream(self, same): @property def fuse_grad_merge(self): """ + Set whether fuse the grad for gradient merge. Note: this flag will only effect the gradient merge under pipeline mode The default value for the fuse_grad_merge is False + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_param_grad = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_param_grad = True + """ return self.strategy.fuse_grad_merge @@ -1426,12 +1463,17 @@ def fuse_grad_merge(self, fuse_grad_merge): @property def fuse_grad_size_in_num(self): """ + This based on raw_program_optimizer program and allreduce the num of the fused op + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_num = 2 + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_num = 2 + """ return self.strategy.fuse_grad_size_in_num @@ -1448,18 +1490,18 @@ def fuse_grad_size_in_num(self, num): @property def pipeline(self): """ + Indicating whether we are using pipeline parallelism for distributed training. Current implementation mainly focus on single GPU machine pipeline parallelism and data parallelism across GPU machine. The pipeline information is indicated through device_guard information in user-defined program. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True """ return self.strategy.pipeline @@ -1499,6 +1541,7 @@ def pipeline(self, flag): @property def pipeline_configs(self): """ + Set pipeline parallelism configurations. In pipeline parallelism, different parts of neural networks are running on different GPUS. There are Tensor queue buffer between each pair of neighborhood GPUS @@ -1514,13 +1557,12 @@ def pipeline_configs(self): **micro_batch_size**: the number of small batches in each user defined batch Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True - strategy.pipeline_configs = {"micro_batch_size": 12} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True + strategy.pipeline_configs = {"micro_batch_size": 12} """ @@ -1537,15 +1579,15 @@ def pipeline_configs(self, configs): @property def tensor_parallel(self): """ + Indicating whether we are using tensor parallel for distributed training. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True """ return self.strategy.tensor_parallel @@ -1561,23 +1603,25 @@ def tensor_parallel(self, flag): @property def tensor_parallel_configs(self): """ + Set tensor_parallel configurations. **Notes**: **Detailed arguments for tensor_parallel_configs** + **tensor_parallel_degree**: degree of tensor parallel + **tensor_init_seed**: parameter initialization random seed Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True - strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, - "tensor_init_seed": 123} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, + "tensor_init_seed": 123} """ return get_msg_dict(self.strategy.tensor_parallel_configs) @@ -1595,28 +1639,32 @@ def tensor_parallel_configs(self, configs): @property def hybrid_configs(self): """ + Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism needs to meet the following relationships total_number_GPUs = dp_degree * mp_degree * pp_degree **Note**: - dp_degree(int): set number of GPUs in a data parallel group. Default -1. + **dp_degree(int)**: set number of GPUs in a data parallel group. 
Default -1. This value should be an integer greater than 0. If it is not set, or set to -1, its value will be inferred based on the total number of cards. - mp_degree(int): set number of GPUs in a model parallel group. Default 1 - pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1 + **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1 + + **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1 Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": 2, - "pp_degree": 1} + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 1} + """ return get_msg_dict(self.strategy.hybrid_configs) @@ -1630,18 +1678,18 @@ def hybrid_configs(self, configs): @property def localsgd(self): """ + Indicating whether we are using Local SGD training. Default Value: False For more details, please refer to `Don't Use Large Mini-Batches, Use Local SGD `_. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True # by default this is false """ return self.strategy.localsgd @@ -1657,6 +1705,7 @@ def localsgd(self, flag): @property def localsgd_configs(self): """ + Set LocalSGD training configurations. LocalSGD has a configurable setting that can be configured through a dict. @@ -1665,14 +1714,14 @@ def localsgd_configs(self): begin_step(int) The step of beginning training by localsgd. Default 1. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4, - "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @@ -1688,18 +1737,17 @@ def localsgd_configs(self, configs): @property def adaptive_localsgd(self): """ + Indicating whether we are using Adaptive Local SGD training. Default Value: False For more details, please refer to `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True # by default this is false """ return self.strategy.adaptive_localsgd @@ -1715,6 +1763,7 @@ def adaptive_localsgd(self, flag): @property def adaptive_localsgd_configs(self): """ + Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable setting that can be configured through a dict. @@ -1722,17 +1771,18 @@ def adaptive_localsgd_configs(self): init_k_steps(int) The initial steps for training before adaptive localsgd. Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. 
+ begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = {"init_k_steps": 1, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True - strategy.adaptive_localsgd_configs = {"init_k_steps": 1, - "begin_step": 30} """ return get_msg_dict(self.strategy.adaptive_localsgd_configs) @@ -1750,18 +1800,18 @@ def adaptive_localsgd_configs(self, configs): @property def dgc(self): """ + Indicating whether we are using Deep Gradient Compression training. For more details, please refer to [Deep Gradient Compression](https://arxiv.org/abs/1712.01887). Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True # by default this is false """ return self.strategy.dgc @@ -1777,6 +1827,7 @@ def dgc(self, flag): @property def dgc_configs(self): r""" + Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable settings that can be configured through a dict. @@ -1793,13 +1844,13 @@ def dgc_configs(self): element will be transmitted. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True + strategy.dgc_configs = {"rampup_begin_step": 1252} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = {"rampup_begin_step": 1252} """ return get_msg_dict(self.strategy.dgc_configs) @@ -1812,16 +1863,17 @@ def dgc_configs(self, configs): @property def fp16_allreduce(self): """ + Indicating whether we are using fp16 gradient allreduce training Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fp16_allreduce = True # by default this is false + strategy = fleet.DistributedStrategy() + strategy.fp16_allreduce = True # by default this is false """ return self.strategy.fp16_allreduce @@ -1836,6 +1888,7 @@ def fp16_allreduce(self, flag): @property def gradient_merge(self): """ + Gradient Merge, also called as Gradient Accumulation, is a strategy for large batch training. With this strategy, model parameter will not be updated until user-defined steps. @@ -1846,13 +1899,13 @@ def gradient_merge(self): to model parameters. Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_merge = True + strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_merge = True - strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} """ return self.strategy.gradient_merge @@ -1867,6 +1920,7 @@ def gradient_merge(self, flag): @property def gradient_merge_configs(self): """ + the key-value configs of distribute_strategy **Note**: @@ -1875,13 +1929,13 @@ def gradient_merge_configs(self): avg(bool): whether to average the gradients of each mini-batch, the default value is `True` Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_merge = True + strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_merge = True - strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} """ return get_msg_dict(self.strategy.gradient_merge_configs) @@ -1896,6 +1950,7 @@ def gradient_merge_configs(self, configs): @property def lars(self): """ + Set lars configurations. lars is used to deal with the convergence problems when the global batch size is larger than 8k. For more details, please refer to [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888). @@ -1903,12 +1958,12 @@ def lars(self): Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lars = True # by default this is false - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lars = True # by default this is false """ return self.strategy.lars @@ -1923,6 +1978,7 @@ def lars(self, flag): @property def lars_configs(self): """ + Set Lars training configurations. **Notes**: @@ -1934,18 +1990,18 @@ def lars_configs(self): will be exclude from weight decay in lars formula. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ['batch_norm', '.b_0'] + } - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lars = True - strategy.lars_configs = { - "lars_coeff": 0.01, - "lars_weight_decay": 0.0005, - "epsilon": 0, - "exclude_from_weight_decay": ['batch_norm', '.b_0'] - } """ return get_msg_dict(self.strategy.lars_configs) @@ -1958,6 +2014,7 @@ def lars_configs(self, configs): @property def lamb(self): """ + Set lamb configurations. lamb is used to deal with the convergence problems for large batch size training, specially for attention-related model like BERT. For more details, please refer to @@ -1966,12 +2023,12 @@ def lamb(self): Default Value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True # by default this is false - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True # by default this is false """ return self.strategy.lamb @@ -1987,6 +2044,7 @@ def lamb(self, flag): @property def lamb_configs(self): """ + Set Lars training configurations. **Notes**: @@ -1995,16 +2053,16 @@ def lamb_configs(self): will be exclude from weight decay in lamb formula. Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True - strategy.lamb_configs = { - 'lamb_weight_decay': 0.01, - 'exclude_from_weight_decay': [], - } """ return get_msg_dict(self.strategy.lamb_configs) @@ -2017,8 +2075,10 @@ def lamb_configs(self, configs): @property def elastic(self): """ + Indicating whether we want to do current distributed training on clusters with elastic resources. Currently, this is configuration is not valid. + """ return self.strategy.elastic @@ -2033,6 +2093,7 @@ def elastic(self, flag): @property def auto(self): """ + Indicating whether we are using auto-parallel configuration This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other @@ -2041,20 +2102,20 @@ def auto(self): Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.auto @@ -2068,6 +2129,7 @@ def auto(self, flag): @property def semi_auto(self): """ + Indicating whether we are using semi-auto parallel function This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other @@ -2076,20 +2138,20 @@ def semi_auto(self): Default Value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.semi_auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.semi_auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.semi_auto @@ -2103,16 +2165,21 @@ def semi_auto(self, flag): @property def auto_search(self): """ + Indicating whether we are using auto-search parallel function For details, please reference the following code example Default Value: False + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.auto_search = True + .. code-block:: python + + import paddle + + paddle.enable_static() + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto_search = True + """ return self.strategy.auto_search @@ -2126,15 +2193,20 @@ def auto_search(self, flag): @property def split_data(self): """ + Indicating whether we split the data. If True, we split the data. Default Value: True + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.split_data = True + .. code-block:: python + + import paddle + + paddle.enable_static() + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.split_data = True + """ return self.strategy.split_data @@ -2148,8 +2220,10 @@ def split_data(self, flag): @property def qat(self): """ + Indicating whether we are using quantization training Default Value: False + """ return self.strategy.qat @@ -2163,6 +2237,7 @@ def qat(self, flag): @property def qat_configs(self): """ + Set quantization training configurations. In general, qat has serveral configurable settings that can be configured through a dict. @@ -2179,17 +2254,17 @@ def qat_configs(self): algo(str): Other quantization training algorithm. Exampless: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.qat = True - strategy.qat_configs = { - "channel_wise_abs_max": True, - "weight_bits": 8, - "activation_bits: 8, - "not_quant_pattern": ['skip_quant']} + strategy = fleet.DistributedStrategy() + strategy.qat = True + strategy.qat_configs = { + "channel_wise_abs_max": True, + "weight_bits": 8, + "activation_bits: 8, + "not_quant_pattern": ['skip_quant']} """ return get_msg_dict(self.strategy.qat_configs) @@ -2202,24 +2277,25 @@ def qat_configs(self, configs): @property def heter_ccl_mode(self): """ + Indicating whether we are using heter_ccl_mode for model training. This feature is currently an experimental feature. Currently, heter_ccl_mode can be used only for dataparallel with dygraph mode. Default Value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + import paddle.distributed.fleet as fleet - import paddle - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.heter_ccl_mode = True - strategy = fleet.DistributedStrategy() - strategy.heter_ccl_mode = True + # for initialize parallel env, only need to call + paddle.distributed.init_parallel_env() + # then the heterogenous context will be created. - # for initialize parallel env, only need to call - paddle.distributed.init_parallel_env() - # then the heterogenous context will be created. """ return self.strategy.heter_ccl_mode @@ -2233,6 +2309,7 @@ def heter_ccl_mode(self, flag): @property def cudnn_exhaustive_search(self): """ + Indicating whether to use exhaustive search method to choose convolution algorithms. Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. This method is time-consuming, the choosed algorithm will be cached for the given layer specifications. @@ -2240,17 +2317,18 @@ def cudnn_exhaustive_search(self): Default Value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_exhaustive_search = False + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_exhaustive_search @@ -2267,6 +2345,7 @@ def cudnn_exhaustive_search(self, flag): @property def conv_workspace_size_limit(self): """ + The workspace limit size in MB unit for choosing cuDNN convolution algorithms. The inner funciton of cuDNN obtain the fastest suited algorithm that fits within this memory limit. Usually, large workspace size may lead to choose faster algorithms, @@ -2274,17 +2353,17 @@ def conv_workspace_size_limit(self): Default Value: 4000 Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.conv_workspace_size_limit = 1024 + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.conv_workspace_size_limit @@ -2302,22 +2381,23 @@ def conv_workspace_size_limit(self, value): @property def cudnn_batchnorm_spatial_persistent(self): """ + Indicates whether to use the mode CUDNN_BATCHNORM_SPATIAL_PERSISTENT function in batchnorm. This is only useful in cudnn. Default Value: True Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_batchnorm_spatial_persistent = True + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_batchnorm_spatial_persistent diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 335125123c4d6..c34d64e611700 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -25,12 +25,13 @@ class ParallelMode: """ + There are all the parallel modes currently supported: - - DATA_PARALLEL: Distribute input data to different devices. - - TENSOR_PARALLEL: Shards tensors in the network to different devices. - - PIPELINE_PARALLEL: Place different layers of the network on different devices. - - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states - corresponding to the parameters to each device. + + - DATA_PARALLEL: Distribute input data to different devices. + - TENSOR_PARALLEL: Shards tensors in the network to different devices. + - PIPELINE_PARALLEL: Place different layers of the network on different devices. + - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device. Examples: .. code-block:: python diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index ca557dc7dd372..18339cd5af37c 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -97,6 +97,7 @@ def _check_var_exists(var_name): def init_parallel_env(): """ + Initialize parallel training environment in dynamic graph mode. Note: @@ -112,6 +113,7 @@ def init_parallel_env(): Examples: .. code-block:: python + # required: gpu import paddle import paddle.nn as nn @@ -152,6 +154,7 @@ def train(): if __name__ == '__main__': dist.spawn(train) + """ # 0. get env & check world size diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index 047a2e45c4b98..5131930d1d959 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -236,13 +236,13 @@ def send_ue_recv( src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. - reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. + message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. + reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. 
If not set or out_size is smaller or equal to 0, then this input will not be used. Otherwise, `out_size` should be equal with or larger than - max(dst_index) + 1. + max(dst_index) + 1. Default value is `None`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/geometric/reindex.py b/python/paddle/geometric/reindex.py index c14e5a551f116..8b755d191c44e 100644 --- a/python/paddle/geometric/reindex.py +++ b/python/paddle/geometric/reindex.py @@ -25,6 +25,7 @@ def reindex_graph( x, neighbors, count, value_buffer=None, index_buffer=None, name=None ): """ + Reindex Graph API. This API is mainly used in Graph Learning domain, which should be used @@ -48,12 +49,12 @@ def reindex_graph( should be the same with `x`. count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -68,6 +69,7 @@ def reindex_graph( .. code-block:: python import paddle + x = [0, 1, 2] neighbors = [8, 9, 0, 4, 7, 6, 7] count = [2, 3, 2] @@ -137,6 +139,7 @@ def reindex_heter_graph( x, neighbors, count, value_buffer=None, index_buffer=None, name=None ): """ + Reindex HeterGraph API. This API is mainly used in Graph Learning domain, which should be used @@ -160,12 +163,12 @@ def reindex_heter_graph( The data type should be the same with `x`. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -182,6 +185,7 @@ def reindex_heter_graph( .. 
code-block:: python import paddle + x = [0, 1, 2] neighbors_a = [8, 9, 0, 4, 7, 6, 7] count_a = [2, 3, 2] diff --git a/python/paddle/geometric/sampling/neighbors.py b/python/paddle/geometric/sampling/neighbors.py index 111b81accaf49..092d87f92331c 100644 --- a/python/paddle/geometric/sampling/neighbors.py +++ b/python/paddle/geometric/sampling/neighbors.py @@ -31,6 +31,7 @@ def sample_neighbors( name=None, ): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -51,16 +52,16 @@ def sample_neighbors( The data type should be the same with `row`. input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. - sample_size (int): The number of neighbors we need to sample. Default value is -1, + sample_size (int, optional): The number of neighbors we need to sample. Default value is -1, which means returning all the neighbors of the input nodes. - eids (Tensor): The eid information of the input graph. If return_eids is True, + eids (Tensor, optional): The eid information of the input graph. If return_eids is True, then `eids` should not be None. The data type should be the same with `row`. Default is None. - return_eids (bool): Whether to return eid information of sample edges. Default is False. - perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` + return_eids (bool, optional): Whether to return eid information of sample edges. Default is False. + perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` is True, then `perm_buffer` should not be None. The data type should be the same with `row`. If not None, we will use fiser-yates sampling - to speed up. Only useful for gpu version. + to speed up. Only useful for gpu version. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -68,15 +69,16 @@ def sample_neighbors( - out_neighbors (Tensor), the sample neighbors of the input nodes. - out_count (Tensor), the number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. + should be the same with `input_nodes`. - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the - sample edges. + sample edges. Examples: .. code-block:: python import paddle + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 931e3c3e398b3..c38497d1a9b0a 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -258,7 +258,9 @@ def _update_input_info(inputs): class StaticGraphAdapter: """ + Model traning/inference with a static graph. + """ def __init__(self, model): @@ -1005,6 +1007,7 @@ def prepare(self): class Model: """ + An Model object is network with training and inference features. Dynamic graph and static graph are supported at the same time, switched by `paddle.enable_static()`. The usage is as follows. @@ -1145,6 +1148,7 @@ def __init__(self, network, inputs=None, labels=None): def train_batch(self, inputs, labels=None, update=True): """ + Run one training step on one batch of data. And using `update` indicates whether optimizer update gradients computing by this batch. 
@@ -1190,6 +1194,7 @@ def train_batch(self, inputs, labels=None, update=True): loss = model.train_batch([data], [label]) print(loss) # [array([2.192784], dtype=float32)] + """ loss = self._adapter.train_batch(inputs, labels, update) if fluid._non_static_mode() and self._input_info is None: @@ -1199,6 +1204,7 @@ def train_batch(self, inputs, labels=None, update=True): @no_grad() def eval_batch(self, inputs, labels=None): """ + Run one evaluating step on a batch of data. Args: @@ -1242,6 +1248,7 @@ def eval_batch(self, inputs, labels=None): loss, acc = model.eval_batch([data], [label]) print(loss, acc) # [array([2.8825705], dtype=float32)] [0.0] + """ loss = self._adapter.eval_batch(inputs, labels) if fluid._non_static_mode() and self._input_info is None: @@ -1251,6 +1258,7 @@ def eval_batch(self, inputs, labels=None): @no_grad() def predict_batch(self, inputs): """ + Run one predicting step on a batch of data. Args: @@ -1289,6 +1297,7 @@ def predict_batch(self, inputs): # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # dtype=float32)] + """ loss = self._adapter.predict_batch(inputs) if fluid._non_static_mode() and self._input_info is None: @@ -1297,6 +1306,7 @@ def predict_batch(self, inputs): def save(self, path, training=True): """ + This function saves parameters, optimizer information or model and paramters only for inference to path. It depends on the parameter `training`. @@ -1364,6 +1374,7 @@ def forward(self, x): model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference + """ if ParallelEnv().local_rank == 0: @@ -1374,6 +1385,7 @@ def forward(self, x): def load(self, path, skip_mismatch=False, reset_optimizer=False): """ + Load from files storing the model states and optimizer states. The file for optimizer states is not necessary if no need to restore the optimizer. @@ -1421,6 +1433,7 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): model.save('checkpoint/test') model.load('checkpoint/test') + """ def _load_state_from_path(path): @@ -1491,6 +1504,7 @@ def _strip_postfix(path): def parameters(self, *args, **kwargs): """ + Returns a list of parameters of the model. Returns: @@ -1513,6 +1527,7 @@ def parameters(self, *args, **kwargs): nn.Linear(200, 10)), input) params = model.parameters() + """ return self._adapter.parameters() @@ -1609,6 +1624,7 @@ def prepare( self, optimizer=None, loss=None, metrics=None, amp_configs=None ): """ + Configures the model before runing. Args: @@ -1640,6 +1656,7 @@ def prepare( Returns: None + """ self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): @@ -1699,6 +1716,7 @@ def fit( num_iters=None, ): """ + Trains the model for a fixed number of epochs. If `eval_data` is set, evaluation will be done at the end of each epoch. @@ -1753,7 +1771,7 @@ def fit( How to make a batch is done internally. .. code-block:: python - :name: code-example1 + :name: code-example3 import paddle import paddle.vision.transforms as T @@ -1793,7 +1811,7 @@ def fit( DataLoader. .. code-block:: python - :name: code-example2 + :name: code-example4 import paddle import paddle.vision.transforms as T @@ -1830,6 +1848,7 @@ def fit( val_loader, epochs=2, save_dir='mnist_checkpoint') + """ assert train_data is not None, "train_data must be given!" 
diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py index 48e52bc691408..14af243784f71 100644 --- a/python/paddle/incubate/operators/graph_sample_neighbors.py +++ b/python/paddle/incubate/operators/graph_sample_neighbors.py @@ -37,6 +37,7 @@ def graph_sample_neighbors( name=None, ): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -72,27 +73,26 @@ def graph_sample_neighbors( For more information, please refer to :ref:`api_guide_Name`. Returns: - out_neighbors (Tensor): The sample neighbors of the input nodes. - out_count (Tensor): The number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the - sample edges. + - out_neighbors (Tensor): The sample neighbors of the input nodes. + - out_count (Tensor): The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`. + - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the sample edges. Examples: .. code-block:: python - import paddle - # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), - # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_size = 2 - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - out_neighbors, out_count = \ - paddle.incubate.graph_sample_neighbors(row, colptr, nodes, - sample_size=sample_size) + + import paddle + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), + # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_size = 2 + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") + out_neighbors, out_count = \ + paddle.incubate.graph_sample_neighbors(row, colptr, nodes, + sample_size=sample_size) """ diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 8e374667fa0b5..b8cef84747791 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -710,6 +710,7 @@ def upsample( name=None, ): """ + This API resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) @@ -720,11 +721,12 @@ def upsample( and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: - 'linear' : Linear interpolation - 'bilinear' : Bilinear interpolation - 'trilinear' : Trilinear interpolation - 'nearest' : Nearest neighbor interpolation - 'bicubic' : Bicubic interpolation + - 'linear' : Linear interpolation + - 'bilinear' : Bilinear interpolation + - 'trilinear' : Trilinear interpolation + - 'nearest' : Nearest neighbor interpolation + - 'bicubic' : Bicubic interpolation + Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. @@ -757,77 +759,78 @@ def upsample( `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. Example: - .. 
code-block:: text + .. code-block:: text - For scale_factor: - if align_corners = True && out_size > 1 : - scale_factor = (in_size-1.0)/(out_size-1.0) + For scale_factor: + if align_corners = True && out_size > 1 : + scale_factor = (in_size-1.0)/(out_size-1.0) + else: + scale_factor = float(in_size/out_size) + Linear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = W_{in} * scale_{factor} + Nearest neighbor interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = floor (H_{in} * scale_{factor}) + W_out = floor (W_{in} * scale_{factor}) else: - scale_factor = float(in_size/out_size) - Linear interpolation: + align_corners = True + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: if: align_corners = False , align_mode = 0 - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Bicubic interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Trilinear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Nearest neighbor interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = floor (H_{in} * scale_{factor}) - W_out = floor (W_{in} * scale_{factor}) - else: - align_corners = True - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = round(H_{in} * scale_{factor}) - W_out = round(W_{in} * scale_{factor}) - Bilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Bicubic interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Trilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = (D_{in}+0.5) * 
scale_{factor} - 0.5 - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = D_{in} * scale_{factor} - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - https://en.wikipedia.org/wiki/Linear_interpolation. For details of linear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Linear_interpolation. For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. @@ -871,23 +874,24 @@ def upsample( name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + Returns: A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Examples: - .. code-block:: python + Examples: + .. code-block:: python - import paddle - import paddle.nn as nn + import paddle + import paddle.nn as nn - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - upsample_out = paddle.nn.Upsample(size=[12,12]) + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + upsample_out = paddle.nn.Upsample(size=[12,12]) - output = upsample_out(x=input_data) - print(output.shape) - # [2L, 3L, 12L, 12L] + output = upsample_out(x=input_data) + print(output.shape) + # [2L, 3L, 12L, 12L] """ return interpolate( diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index c1d40a83cfb0d..4383b0cc7a2f2 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -23,6 +23,7 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2.0, epsilon=1e-6, keepdim=False, name=None): Returns: Tensor, the dtype is same as input tensor. + - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index efe26e5a42ebf..234224964b3f0 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1305,6 +1305,7 @@ def margin_ranking_loss( def l1_loss(input, label, reduction='mean', name=None): r""" + Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -1336,7 +1337,7 @@ def l1_loss(input, label, reduction='mean', name=None): Returns: Tensor, the L1 Loss of Tensor ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . + If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` . 
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: @@ -1359,6 +1360,7 @@ def l1_loss(input, label, reduction='mean', name=None): l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') print(l1_loss.numpy()) # [1.4] + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( @@ -2281,6 +2283,7 @@ def cross_entropy( name=None, ): r""" + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. @@ -2394,21 +2397,13 @@ def cross_entropy( Parameters: - - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` . Note: - - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the - output of softmax operator, which will produce incorrect results. - + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) - + label (Tensor): 1. If soft_label=False, the shape is :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. @@ -2416,48 +2411,27 @@ def cross_entropy( 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **weight** (Tensor, optional) - - a manual rescaling weight given to each class. + weight (Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored + ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the loss. A negative value means that no label value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. - Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. + soft_label (bool, optional): Indicate whether label is soft. Default is ``False``. + axis (int, optional):The index of dimension to perform softmax calculations. It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. + use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. 
- - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -2473,9 +2447,7 @@ def cross_entropy( 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . - Examples: - .. code-block:: python # hard labels @@ -3958,6 +3930,7 @@ def multi_margin_loss( def soft_margin_loss(input, label, reduction='mean', name=None): """ + The API measures the soft margin loss between input predictions ``input`` and target labels ``label`` . It can be described as: @@ -3966,9 +3939,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Parameters: - input (Tensor): The input predications tensor with shape: [N, *], + input (Tensor): The input predications tensor with shape: ``[N, *]``, N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. - Available dtype is float32, float64. + Available dtype is float32, float64. label (Tensor): The target labels tensor with the same shape as ``input``. The target labels which values should be numbers -1 or 1. @@ -3986,8 +3959,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Returns: - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1]. Examples: .. code-block:: python @@ -4013,6 +3985,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index c361ac43aad13..5e8f77a9810bf 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1735,15 +1735,17 @@ def adaptive_avg_pool1d(x, output_size, name=None): def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): r""" + Applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. For avg adaptive pool2d: + .. math:: - hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + hstart &= floor(i * H_{in} / H_{out}) \\ + hend &= ceil((i + 1) * H_{in} / H_{out}) \\ + wstart &= floor(j * W_{in} / W_{out}) \\ + wend &= ceil((j + 1) * W_{in} / W_{out}) \\ Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Args: @@ -1752,14 +1754,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format (str, optional): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". 
When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor. Examples: .. code-block:: python @@ -1787,6 +1790,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): x = x, output_size=[3, 3]) # out.shape is [2, 3, 3, 3] + """ if not in_dynamic_mode(): check_variable_and_dtype( @@ -1879,34 +1883,36 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): r""" + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. For avg adaptive pool3d: + .. math:: - dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + dstart &= floor(i * D_{in} / D_{out}) \\ + dend &= ceil((i + 1) * D_{in} / D_{out}) \\ + hstart &= floor(j * H_{in} / H_{out}) \\ + hend &= ceil((j + 1) * H_{in} / H_{out}) \\ + wstart &= floor(k * W_{in} / W_{out}) \\ + wend &= ceil((k + 1) * W_{in} / W_{out}) \\ Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} {(dend - dstart) * (hend - hstart) * (wend - wstart)} Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. - The data type can be float32, float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or + list, it must contain three elements, (D, H, W). D, H and W can be either a int, + or None which means the size will be the same as that of the input. + data_format (str, optional): The data format of the input and output data. An optional string from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Examples: .. 
code-block:: python @@ -1936,6 +1942,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): x = input_data, output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] + """ if not in_dynamic_mode(): check_variable_and_dtype( diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 57434e2c9ce9c..7b60c52ea5497 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -1449,15 +1449,16 @@ def extra_repr(self): class Softmax2D(Layer): r""" + Softmax2D Activation. Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j). The sum of result in each location (C, H_i, W_j) will be one. Shape: - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)` - - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input) + - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input) - Return: + Returns: A Tensor of the same shape and dtype as input with value in range [0, 1]. Examples: @@ -1482,6 +1483,7 @@ class Softmax2D(Layer): # [[0.42368975 0.51082766 0.47752273 0.5258871 ] # [0.66754097 0.47182566 0.5187628 0.5402329 ] # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] + """ def __init__(self, name=None): diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index e6b756f04628a..72dea12b49a71 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -20,6 +20,7 @@ class PairwiseDistance(Layer): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -38,14 +39,14 @@ class PairwiseDistance(Layer): Generally, no setting is required. Default: None. Shape: - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - is the dimension of the data. Available data type is float32, float64. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - output: The same dtype as input tensor. + - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` + is the dimension of the data. Available data type is float32, float64. + - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. + - output: The same dtype as input tensor. - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index b180f89d6f9dc..17471dd225448 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -26,7 +26,8 @@ class BCEWithLogitsLoss(Layer): r""" - This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + + This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -49,7 +50,7 @@ class BCEWithLogitsLoss(Layer): For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: - .. math:: + .. 
math:: Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the @@ -81,21 +82,21 @@ class BCEWithLogitsLoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shapes: - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], - N is batch_size, `*` means number of additional dimensions. The ``logit`` - is usually the output of Linear layer. Available dtype is float32, float64. - label (Tensor): The target labels tensor. 2-D tensor with the same shape as - ``logit``. The target labels which values should be numbers between 0 and 1. - Available dtype is float32, float64. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``logit`` , else the shape of output is scalar. + - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, `*`], + N is batch_size, `*` means number of additional dimensions. The ``logit`` + is usually the output of Linear layer. Available dtype is float32, float64. + - label (Tensor): The target labels tensor. 2-D tensor with the same shape as + ``logit``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``logit`` , else the shape of output is scalar. Returns: A callable object of BCEWithLogitsLoss. Examples: - .. code-block:: python + import paddle logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") @@ -134,6 +135,7 @@ def forward(self, logit, label): class CrossEntropyLoss(Layer): r""" + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. @@ -246,60 +248,35 @@ class CrossEntropyLoss(Layer): Parameters: - - - **weight** (Tensor, optional) - - a manual rescaling weight given to each class. + weight (Tensor, optional): a manual rescaling weight given to each class. If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored + ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the loss. A negative value means that no label value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. + soft_label (bool, optional): Indicate whether label is soft. If soft_label=False, the label is hard. If soft_label=True, the label is soft. Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. + axis (int, optional): The index of dimension to perform softmax calculations. 
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. + use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. - - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Shape: - - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - + - **input** (Tensor), the data type is float32, float64. Shape is + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . Note: 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the @@ -307,7 +284,6 @@ class CrossEntropyLoss(Layer): 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) 1. If soft_label=False, the shape is @@ -317,15 +293,10 @@ class CrossEntropyLoss(Layer): 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **output** (Tensor) - - Return the softmax cross_entropy loss of ``input`` and ``label``. - - The data type is the same as input. - - If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. - - If :attr:`reduction` is ``'none'``: + - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``. + The data type is the same as input. + If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. + If :attr:`reduction` is ``'none'``: 1. If soft_label = False, the dimension of return value is the same with ``label`` . @@ -629,6 +600,7 @@ def forward(self, input, label): class L1Loss(Layer): r""" + Construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. @@ -658,11 +630,11 @@ class L1Loss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. - label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. - output (Tensor): The L1 Loss of ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . - If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + - input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. + - label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64. + - output (Tensor): The L1 Loss of ``input`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: .. 
code-block:: python @@ -687,6 +659,7 @@ class L1Loss(Layer): print(output) # [[0.20000005 0.19999999] # [0.2 0.79999995]] + """ def __init__(self, reduction='mean', name=None): @@ -707,6 +680,7 @@ def forward(self, input, label): class BCELoss(Layer): """ + This interface is used to construct a callable object of the ``BCELoss`` class. The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` and target labels ``label`` . The binary_cross_entropy loss can be described as: @@ -750,14 +724,14 @@ class BCELoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means - number of additional dimensions. The input ``input`` should always - be the output of sigmod. Available dtype is float32, float64. - label (Tensor): 2-D tensor with the same shape as ``input``. The target - labels which values should be numbers between 0 and 1. Available - dtype is float32, float64. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is scalar. + - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means + number of additional dimensions. The input ``input`` should always + be the output of sigmod. Available dtype is float32, float64. + - label (Tensor): 2-D tensor with the same shape as ``input``. The target + labels which values should be numbers between 0 and 1. Available + dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is scalar. Returns: A callable object of BCELoss. @@ -850,7 +824,7 @@ class NLLLoss(Layer): if `reduction` is ``'sum'``, the reduced sum loss is returned; if `reduction` is ``'none'``, no reduction will be apllied. Default is ``'mean'``. - name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: - input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes. @@ -909,6 +883,7 @@ def forward(self, input, label): class KLDivLoss(Layer): r""" + Generate a callable object of 'KLDivLoss' to calculate the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability @@ -928,14 +903,10 @@ class KLDivLoss(Layer): Default is ``'mean'``. Shape: - - - input (Tensor): (N, *), where * means, any number of additional dimensions. - - - label (Tensor): (N, *), same shape as input. - + - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions. + - label (Tensor): ``(N, *)``, same shape as input. - output (Tensor): tensor with shape: [1] by default. - Examples: .. code-block:: python @@ -965,6 +936,7 @@ class KLDivLoss(Layer): kldiv_criterion = nn.KLDivLoss(reduction='none') pred_loss = kldiv_criterion(x, target) # shape=[5, 20] + """ def __init__(self, reduction='mean'): @@ -1817,6 +1789,7 @@ def forward(self, input, label): class SoftMarginLoss(Layer): r""" + Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` and target labels ``label`` . It can be described as: @@ -1835,17 +1808,14 @@ class SoftMarginLoss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Shapes: - - Input (Tensor): The input tensor with shape: [N, *], - N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf - Available dtype is float32, float64. - - Label (Tensor): The target labels tensor with the same shape as - ``input``. The target labels which values should be numbers -1 or 1. - Available dtype is int32, int64, float32, float64. - - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + - Input (Tensor): The input tensor with shape: ``[N, *]``, + N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf + Available dtype is float32, float64. + - Label (Tensor): The target labels tensor with the same shape as + ``input``. The target labels which values should be numbers -1 or 1. + Available dtype is int32, int64, float32, float64. + - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is [1]. Returns: A callable object of SoftMarginLoss. @@ -1877,6 +1847,7 @@ class SoftMarginLoss(Layer): # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) + """ def __init__(self, reduction='mean', name=None): diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 01fd204cab9a4..cd28479b5d28c 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -318,6 +318,7 @@ def _check_input_dim(self, input): class GroupNorm(Layer): """ + This interface is used to construct a callable object of the ``GroupNorm`` class. For more details, refer to code examples. It implements the function of the Group Normalization Layer. @@ -338,7 +339,7 @@ class GroupNorm(Layer): name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: Tensor with shape: (batch, num_features, *). + - x: Tensor with shape: attr:`(batch, num_features, *)`. - output: The same shape as input x. Returns: @@ -1041,6 +1042,7 @@ def _check_input_dim(self, input): class SyncBatchNorm(_BatchNormBase): r""" + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected @@ -1086,9 +1088,9 @@ class SyncBatchNorm(_BatchNormBase): - :math:`\beta` : trainable shift parameter vector Note: - If you want to use container to pack your model and has ``SyncBatchNorm`` in the - evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of - ``list`` to pack the model. + If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the + evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of + :ref:`api_paddle_hub_list` to pack the model. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -1106,29 +1108,30 @@ class SyncBatchNorm(_BatchNormBase): have trainable bias parameter. Default: None. Shapes: - input: Tensor that the dimension from 2 to 5. - output: Tensor with the same shape as input. + - input: Tensor that the dimension from 2 to 5. + - output: Tensor with the same shape as input. Examples: .. 
code-block:: python - # required: gpu + # required: gpu - import paddle - import paddle.nn as nn + import paddle + import paddle.nn as nn - x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - if paddle.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1) - # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[[[ 0.26824948, 1.09363246], - # [ 0.26824948, -1.63013160]], + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) + # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[[ 0.26824948, 1.09363246], + # [ 0.26824948, -1.63013160]], + + # [[ 0.80956620, -0.66528702], + # [-1.27446556, 1.13018656]]]]) - # [[ 0.80956620, -0.66528702], - # [-1.27446556, 1.13018656]]]]) """ def __init__( @@ -1277,8 +1280,8 @@ def convert_sync_batchnorm(cls, layer): The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. Examples: - .. code-block:: python + import paddle import paddle.nn as nn diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 101686ee2487c..3c37fae3fe174 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -223,6 +223,7 @@ def extra_repr(self): class AvgPool3D(Layer): """ + This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCDHW format, where N is batch size, C is the number of channels, @@ -263,6 +264,7 @@ class AvgPool3D(Layer): The data type can be float32, float64. - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + Examples: .. code-block:: python diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index ce909a73fa588..4c12e9658d311 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -613,14 +613,17 @@ def forward(self, input): class QuantizedConv2DTranspose(Layer): """ + The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. Examples: .. code-block:: python + import paddle import paddle.nn as nn from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) conv = nn.Conv2DTranspose(4, 6, (3, 3)) conv_quantized = QuantizedConv2DTranspose(conv) @@ -630,6 +633,7 @@ class QuantizedConv2DTranspose(Layer): y_np = y_var.numpy() print(y_np.shape, y_quantized_np.shape) # (2, 6, 10, 10), (2, 6, 10, 10) + """ def __init__( diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 6f96e12f995d8..387f9479a8ae0 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1647,6 +1647,7 @@ def get_lr(self): class OneCycleLR(LRScheduler): r""" + Sets the learning rate according to the one cycle learning rate scheduler. The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. 
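To make the shape of the one-cycle schedule concrete, here is a minimal dygraph sketch that records the learning rate at every step; the tiny model, the step count and the concrete hyperparameter values are illustrative assumptions rather than part of this patch.

    import paddle

    linear = paddle.nn.Linear(10, 10)

    # The rate starts at max_learning_rate / divide_factor, rises to
    # max_learning_rate during the first phase_pct of total_steps, then
    # anneals down towards end_learning_rate.
    scheduler = paddle.optimizer.lr.OneCycleLR(
        max_learning_rate=0.1,
        total_steps=25,
        divide_factor=25.0,
        end_learning_rate=0.0001,
        phase_pct=0.3,
        anneal_strategy='cos',
        three_phase=False)
    sgd = paddle.optimizer.SGD(learning_rate=scheduler,
                               parameters=linear.parameters())

    lrs = []
    for step in range(20):
        x = paddle.rand([4, 10])
        loss = (linear(x) ** 2).mean()
        loss.backward()
        sgd.step()
        sgd.clear_grad()
        lrs.append(sgd.get_lr())
        scheduler.step()  # the learning rate must be updated every step

    # The recorded values climb towards 0.1 over roughly the first
    # phase_pct * total_steps steps, then decay towards end_learning_rate.
    print(lrs)
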
@@ -1660,22 +1661,25 @@ class OneCycleLR(LRScheduler): Also note that you should update learning rate each step. Args: - max_learning_rate (float): The maximum learning rate. It is a python float number. - Functionally, it defines the initial learning rate by ``divide_factor`` . + max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` . total_steps (int): Number of total training steps. - divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. - anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. three_phase (bool, optional): Whether to use three phase. + If ``True``: + 1. The learning rate will first increase from initial learning rate to maximum learning rate. 2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase. 3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate. + If ``False``: + 1. The learning rate will increase to maximum learning rate. 2. Then it will directly decrease to minimum learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -1727,6 +1731,7 @@ class OneCycleLR(LRScheduler): }, fetch_list=loss.name) scheduler.step() # You should update learning rate each step + """ def __init__( diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9a907049f5400..ddde2a68988b9 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1194,6 +1194,7 @@ def triu(x, diagonal=0, name=None): def meshgrid(*args, **kwargs): """ + Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids. Args: diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index edf316bbf508a..4864317254dde 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -732,6 +732,7 @@ def preprocess(equation, *operands): def parse_fake_shape(equation, operands, labels): """ + this shape is just used for operands planning. may differ with the original shape. for example: ... is replaced by 1 @@ -739,6 +740,7 @@ def parse_fake_shape(equation, operands, labels): Results ------- list of shape + """ shaped = collections.namedtuple('shaped', ['shape']) @@ -862,6 +864,7 @@ def gen_einsum_op(equation, *operands): def einsum(equation, *operands): r""" + einsum(equation, *operands) The current version of this API should be used in dygraph only mode. 
@@ -890,35 +893,35 @@ def einsum(equation, *operands): **The summation notation** - The tensor dimensions are labeled using uncased English letters. E.g., `ijk` - relates to a three dimensional tensor whose dimensions are labeled i, j, and k. + relates to a three dimensional tensor whose dimensions are labeled i, j, and k. - The equation is `,` separated into terms, each being a distinct input's - dimension label string. + dimension label string. - Ellipsis `...` enables broadcasting by automatically converting the unlabeled - dimensions into broadcasting dimensions. + dimensions into broadcasting dimensions. - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled - dimensions will be reduced and removed in the output. - - Output labels can be explicitly specified on the right hand side of `->` or omitted. - In the latter case, the output labels will be inferred from the input labels. + dimensions will be reduced and removed in the output. + - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels. - Inference of output labels - Broadcasting label `...`, if present, is put on the leftmost position. - Free labels are reordered alphabetically and put after `...`. - On explicit output labels - If broadcasting is enabled, then `...` must be present. - The output labels can be an empty, an indication to output as a scalar - the sum over the original output. + the sum over the original output. - Non-input labels are invalid. - Duplicate labels are invalid. - For any dummy label which is present for the output, it's promoted to - a free label. + a free label. - For any free label which is not present for the output, it's lowered to - a dummy label. + a dummy label. + - Examples - '...ij, ...jk', where i and k are free labels, j is dummy. The output label - string is '...ik' + string is '...ik' - 'ij -> i', where i is a free label and j is a dummy label. - '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ij', an invalid equation since `...` is not present for - the output. + the output. **The summation rule** @@ -926,8 +929,8 @@ def einsum(equation, *operands): may vary significantly due to implementation specific optimization. - Step 1: preparation for broadcasting, that is, transposing and unsqueezing - the input operands to have each resulting dimension identically labeled across - all the input operands. + the input operands to have each resulting dimension identically labeled across + all the input operands. - Step 2: broadcasting multiply all the resulting operands from step 1. - Step 3: reducing dummy labeled dimensions. - Step 4: transposing the result tensor to match the output labels. @@ -944,78 +947,79 @@ def einsum(equation, *operands): operands should equal the number of input terms in the equation. Returns: - result (`Tensor`): the result tensor. + result (`Tensor`), the result tensor. Examples: .. 
code-block:: python - import paddle - paddle.seed(102) - x = paddle.rand([4]) - y = paddle.rand([5]) - - # sum - print(paddle.einsum('i->', x)) - # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # 1.95791852) - - # dot - print(paddle.einsum('i,i->', x, x)) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [1.45936954]) - - # outer - print(paddle.einsum("i,j->ij", x, y)) - # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], - # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], - # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], - # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) - - A = paddle.rand([2, 3, 2]) - B = paddle.rand([2, 2, 3]) - - # transpose - print(paddle.einsum('ijk->kji', A)) - # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.49684682], - # [0.80071914, 0.46258664], - # [0.49814570, 0.33383518]], - # - # [[0.07637714, 0.29374704], - # [0.51470858, 0.51907635], - # [0.99066722, 0.55802226]]]) - - # batch matrix multiplication - print(paddle.einsum('ijk, ikl->ijl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) - - # Ellipsis transpose - print(paddle.einsum('...jk->...kj', A)) - # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.80071914, 0.49814570], - # [0.07637714, 0.51470858, 0.99066722]], - # - # [[0.49684682, 0.46258664, 0.33383518], - # [0.29374704, 0.51907635, 0.55802226]]]) - - # Ellipsis batch matrix multiplication - print(paddle.einsum('...jk, ...kl->...jl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) + import paddle + paddle.seed(102) + x = paddle.rand([4]) + y = paddle.rand([5]) + + # sum + print(paddle.einsum('i->', x)) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 1.95791852) + + # dot + print(paddle.einsum('i,i->', x, x)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [1.45936954]) + + # outer + print(paddle.einsum("i,j->ij", x, y)) + # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], + # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], + # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], + # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) + + A = paddle.rand([2, 3, 2]) + B = paddle.rand([2, 2, 3]) + + # transpose + print(paddle.einsum('ijk->kji', A)) + # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.49684682], + # [0.80071914, 0.46258664], + # [0.49814570, 0.33383518]], + # + # [[0.07637714, 0.29374704], + # [0.51470858, 0.51907635], + # [0.99066722, 0.55802226]]]) + + # batch matrix multiplication + 
print(paddle.einsum('ijk, ikl->ijl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + + # Ellipsis transpose + print(paddle.einsum('...jk->...kj', A)) + # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.80071914, 0.49814570], + # [0.07637714, 0.51470858, 0.99066722]], + # + # [[0.49684682, 0.46258664, 0.33383518], + # [0.29374704, 0.51907635, 0.55802226]]]) + + # Ellipsis batch matrix multiplication + print(paddle.einsum('...jk, ...kl->...jl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + """ import os diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 1a3a5595928e3..3c26703a80d51 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1905,12 +1905,15 @@ def __check_input(x, vec): def det(x, name=None): """ + Calculates determinant value of a square matrix or batches of square matrices. Args: - x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the + x (Tensor): the input matrix of size `(n, n)` or the batch of matrices of size `(*, n, n)` where `*` is one or more batch dimensions. + name(str, optional): Name of the output. Default is None. It's used + to print debug info for developers. Details: :ref:`api_guide_Name` Returns: Tensor, the determinant value of a square matrix or batches of square matrices. @@ -1961,18 +1964,20 @@ def det(x, name=None): def slogdet(x, name=None): """ + Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. - The determinant can be computed with ``sign * exp(logabsdet) + The determinant can be computed with ``sign * exp`` (logabsdet) Supports input of float, double Note that for matrices that have zero determinant, this returns ``(0, -inf)`` + Args: x (Tensor): the batch of matrices of size :math:`(*, n, n)` where math:`*` is one or more batch dimensions. Returns: - y (Tensor): A tensor containing the sign of the determinant and the natural logarithm + y (Tensor), A tensor containing the sign of the determinant and the natural logarithm of the absolute value of determinant, respectively. Examples: @@ -2090,6 +2095,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" + Computes the n-th power of a square matrix or a batch of square matrices. Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be @@ -2115,8 +2121,8 @@ def matrix_power(x, n, name=None): For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - data type should be the same as that of `x`. + - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its + data type should be the same as that of `x`. Examples: .. 
code-block:: python @@ -3054,8 +3060,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): def solve(x, y, name=None): r""" + Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. - Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`Y` be a vector/matrix or a batch of vectors/matrices, the equation should be: .. math:: @@ -3064,9 +3071,9 @@ def solve(x, y, name=None): Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Args: - x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or + x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or + y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index cf6ff6633bb6f..62dfcad0b3500 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -272,7 +272,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): - """ + r""" + stanh activation. .. math:: @@ -283,8 +284,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): x (Tensor): The input Tensor with data type float32, float64. scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_b (float, optional): The scale factor b of the output. Default is 1.7159. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: A Tensor with the same data type and shape as ``x`` . diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index f8400859de178..ef9a51cbdec60 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -1296,15 +1296,17 @@ def distribute_fpn_proposals( name=None, ): r""" - In Feature Pyramid Networks (FPN) models, it is needed to distribute + + In Feature Pyramid Networks (FPN) models, it is needed to distribute all proposals into different FPN level, with respect to scale of the proposals, the referring scale and the referring level. Besides, to restore the order of proposals, we return an array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: .. math:: - roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} - level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\ + level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) + where BBoxArea is a function to compute the area of each roi. Args: @@ -1328,13 +1330,13 @@ def distribute_fpn_proposals( None by default. Returns: - multi_rois (List) : The proposals in each FPN level. 
It is a list of 2-D Tensor with shape [M, 4], where M is - and data type is same as `fpn_rois` . The length is max_level-min_level+1. - restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - , where N is the number of total rois. The data type is int32. - rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape - is [B] and data type of int32, where B is the number of images. + - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is + and data type is same as `fpn_rois` . The length is max_level-min_level+1. + - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] + , where N is the number of total rois. The data type is int32. + - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32, where B is the number of images. Examples: .. code-block:: python @@ -1351,6 +1353,7 @@ def distribute_fpn_proposals( refer_level=4, refer_scale=224, rois_num=rois_num) + """ num_lvl = max_level - min_level + 1 @@ -2438,6 +2441,7 @@ def matrix_nms( name=None, ): """ + This operator does matrix non maximum suppression (NMS). First selects a subset of candidate bounding boxes that have higher scores than score_threshold (if provided), then the top k candidate is selected if @@ -2445,6 +2449,7 @@ def matrix_nms( decayed according to the Matrix NMS scheme. Aftern NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. + Args: bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes, @@ -2468,29 +2473,32 @@ def matrix_nms( on score_threshold. keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. - use_gaussian (bool): Use Gaussian as the decay function. Default: False - gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 - background_label (int): The index of background label, the background + use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False + gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0 + background_label (int, optional): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: 0 - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether return selected index. Default: False - return_rois_num(bool): whether return rois_num. Default: True - name(str): Name of the matrix nms op. Default: None. + normalized (bool, optional): Whether detections are normalized. Default: True + return_index(bool, optional): Whether return selected index. Default: False + return_rois_num(bool, optional): whether return rois_num. Default: True + name(str, optional): Name of the matrix nms op. Default: None. Returns: - A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, - otherwise, a tuple with two Tensor (Out, RoisNum) is returned. - Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - detection results. 
- Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - selected indices, which are absolute values cross batches. - rois_num (Tensor): A 1-D Tensor with shape [N] containing - the number of detected boxes in each image. + - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax] + - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. + - rois_num (Tensor), A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: .. code-block:: python + import paddle from paddle.vision.ops import matrix_nms + boxes = paddle.rand([4, 1, 4]) boxes[..., 2] = boxes[..., 0] + boxes[..., 2] boxes[..., 3] = boxes[..., 1] + boxes[..., 3] @@ -2498,6 +2506,7 @@ def matrix_nms( out = matrix_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, post_threshold=0.1, nms_top_k=400, keep_top_k=200, normalized=False) + """ check_variable_and_dtype( bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms' From 91f4d1cee18dd286ecf3ee4738babe36493ddf12 Mon Sep 17 00:00:00 2001 From: ustiniankw <102717963+ustiniankw@users.noreply.github.com> Date: Tue, 22 Nov 2022 17:31:52 +0800 Subject: [PATCH 153/210] Fixdocs (#47986) * list112-122, test=document_fix * precommitfix, test=document_fix * list112-127, test=document_fix * fix_ResNetBasicBlock, test=document_fix * pre-commit_resnet, test=document_fix * refix, test=document * refix, test=document_fix --- python/paddle/fft.py | 56 ++++++------- .../contrib/sparsity/supported_layer_list.py | 10 ++- python/paddle/fluid/contrib/sparsity/utils.py | 15 ++-- python/paddle/fluid/dygraph/layers.py | 63 +++++++++----- python/paddle/fluid/framework.py | 45 +++++++--- python/paddle/fluid/layers/metric_op.py | 18 ++-- python/paddle/fluid/layers/nn.py | 19 ++--- .../nn/functional/fused_transformer.py | 6 +- .../incubate/nn/layer/fused_transformer.py | 11 ++- .../incubate/operators/graph_khop_sampler.py | 41 +++++---- .../incubate/operators/graph_reindex.py | 84 +++++++++---------- python/paddle/incubate/xpu/resnet_block.py | 9 +- python/paddle/signal.py | 13 ++- python/paddle/sparse/nn/layer/activation.py | 7 ++ 14 files changed, 235 insertions(+), 162 deletions(-) diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 1e4ca9237469b..a1748c76b9285 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -626,6 +626,7 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): def rfftn(x, s=None, axes=None, norm="backward", name=None): """ + The N dimensional FFT for real input. This function computes the N-dimensional discrete Fourier Transform over @@ -659,9 +660,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): three operations are shown below: - "backward": The factor of forward direction and backward direction are ``1`` - and ``1/n`` respectively; + and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` - and ``1`` respectively; + and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. Where ``n`` is the multiplication of each element in ``s`` . 
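The effect of the three normalization modes just described can be checked with a short NumPy snippet (assuming NumPy >= 1.20, whose ``norm`` keyword follows the same backward/forward/ortho convention):

.. code-block:: python

    import numpy as np

    x = np.ones(4)  # n = 4, so the DC term of the unscaled transform is 4
    print(np.fft.rfft(x, norm="backward")[0].real)  # 4.0 -> forward factor 1
    print(np.fft.rfft(x, norm="forward")[0].real)   # 1.0 -> forward factor 1/n
    print(np.fft.rfft(x, norm="ortho")[0].real)     # 2.0 -> forward factor 1/sqrt(n)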
@@ -670,36 +671,35 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): refer to :ref:`api_guide_Name` . Returns: - out(Tensor): complex tensor + out(Tensor), complex tensor Examples: + .. code-block:: python - .. code-block:: python - - import paddle + import paddle - # default, all axis will be used to exec fft - x = paddle.ones((2, 3, 4)) - print(paddle.fft.rfftn(x)) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - # use axes(2, 0) - print(paddle.fft.rfftn(x, axes=(2, 0))) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) + # default, all axis will be used to exec fft + x = paddle.ones((2, 3, 4)) + print(paddle.fft.rfftn(x)) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + # use axes(2, 0) + print(paddle.fft.rfftn(x, axes=(2, 0))) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) """ return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py index f55a877b4b7f3..b0b64f27eccc1 100644 --- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -82,15 +82,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): def add_supported_layer(layer, pruning_func=None): r""" + Add supported layers and its corresponding pruning function. Args: name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then - it would be turn to string internally. ASP would use this name to match parameter's name and call - its the corresponding pruning function. + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. pruning_func (function, optional): a function type which receives five argument (weight_nparray, - m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, - m, n, and func_name, please see `prune_model` for details. + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. + """ name = None if isinstance(layer, str): diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index b5be3887380ae..b9a5c0a7b31da 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -92,20 +92,25 @@ def get_checking_method(mask_algo): def calculate_density(x): r""" + Return the density of the input tensor. Args: x (nparray): The input tensor. + Returns: - float: The density of :attr:`x`. + float, The density of :attr:`x`. + Examples: .. 
code-block:: python - import paddle - import numpy as np - x = np.array([[0, 1, 3, 0], + import paddle + import numpy as np + + x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - paddle.incubate.asp.calculate_density(x) # 0.625 + paddle.incubate.asp.calculate_density(x) # 0.625 + """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 5e15519bd9627..1593cc78e6a2c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -177,13 +177,14 @@ def __init__(self, name_scope=None, dtype="float32"): def train(self): """ + Sets this Layer and all its sublayers to training mode. This only effects certain modules like `Dropout` and `BatchNorm`. Returns: None - Example:: + Examples: .. code-block:: python import paddle @@ -260,6 +261,7 @@ def forward(self, input): def apply(self, fn): """ + Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``) as well as self. Typical use includes initializing the parameters of a model. @@ -267,7 +269,7 @@ def apply(self, fn): fn (function): a function to be applied to each sublayer Returns: - Layer: self + Layer, self Example:: .. code-block:: python @@ -287,6 +289,7 @@ def init_weights(layer): net.apply(init_weights) print(net.state_dict()) + """ for layer in self.children(): layer.apply(fn) @@ -296,10 +299,12 @@ def init_weights(layer): return self def full_name(self): - """Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ + """ + + Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ Returns: - str: full name of this layer. + str, full name of this layer. Example:: .. code-block:: python @@ -321,7 +326,9 @@ def forward(self, x): return self._full_name def register_forward_post_hook(self, hook): - """Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. + """ + + Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively. User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer. @@ -332,7 +339,7 @@ def register_forward_post_hook(self, hook): hook(function): a function registered as a forward post-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. code-block:: python @@ -364,13 +371,16 @@ def forward_post_hook(layer, input, output): # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. assert (out0.numpy() == (out1.numpy()) * 2).any() + """ hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) self._forward_post_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper def register_forward_pre_hook(self, hook): - """Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. + """ + + Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. 
It should have the following form, `input` of the `hook` is `input` of the `Layer`, hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if @@ -383,7 +393,7 @@ def register_forward_pre_hook(self, hook): hook(function): a function registered as a forward pre-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. code-block:: python @@ -581,18 +591,20 @@ def forward(self, input): ) def parameters(self, include_sublayers=True): - """Returns a list of all Parameters from current layer and its sub-layers. + """ + + Returns a list of all Parameters from current layer and its sub-layers. Returns: - list of Tensor : a list of Parameters. + list of Tensor, a list of Parameters. Examples: .. code-block:: python - import paddle + import paddle - linear = paddle.nn.Linear(1,1) - print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + linear = paddle.nn.Linear(1,1) + print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 """ ret = [ @@ -604,7 +616,9 @@ def parameters(self, include_sublayers=True): return ret def children(self): - """Returns an iterator over immediate children layers. + """ + + Returns an iterator over immediate children layers. Yields: Layer: a child layer @@ -654,13 +668,15 @@ def named_children(self): yield name, layer def sublayers(self, include_self=False): - """Returns a list of sub layers. + """ + + Returns a list of sub layers. Parameters: include_self(bool, optional): Whether return self as sublayers. Default: False Returns: - list of Layer : a list of sub layers. + list of Layer, a list of sub layers. Examples: .. code-block:: python @@ -839,13 +855,14 @@ def register_buffer(self, name, tensor, persistable=True): def buffers(self, include_sublayers=True): """ + Returns a list of all buffers from current layer and its sub-layers. Parameters: include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True Returns: - list of Tensor : a list of buffers. + list of Tensor, a list of buffers. Examples: .. code-block:: python @@ -1020,7 +1037,9 @@ def backward(self, *inputs): raise ValueError("Layer shouldn't implement backward") def add_sublayer(self, name, sublayer): - """Adds a sub Layer instance. + """ + + Adds a sub Layer instance. Added sublayer can be accessed by self.name @@ -1028,7 +1047,7 @@ def add_sublayer(self, name, sublayer): name(str): name of this sublayer. sublayer(Layer): an instance of Layer. Returns: - Layer: the sublayer passed in. + Layer, the sublayer passed in. Examples: .. code-block:: python @@ -1055,6 +1074,7 @@ def forward(self, input): model = MySequential(fc1, fc2) for prefix, layer in model.named_sublayers(): print(prefix, layer) + """ assert isinstance(sublayer, Layer) or sublayer is None @@ -1070,7 +1090,7 @@ def add_parameter(self, name, parameter): name(str): name of this sublayer. parameter(Parameter): an instance of Parameter. Returns: - Parameter: the parameter passed in. + Parameter, the parameter passed in. Examples: .. code-block:: python @@ -1503,6 +1523,7 @@ def to_static_state_dict( use_hook=True, ): ''' + Get all parameters and buffers of current layer and its sub-layers. 
And set them into a dict Parameters: @@ -1511,7 +1532,7 @@ def to_static_state_dict( use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True Retruns: - dict: a dict contains all the parameters and persistable buffers. + dict, a dict contains all the parameters and persistable buffers. Examples: .. code-block:: python diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4fc525003f71d..c5e0631ecd4da 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1352,12 +1352,13 @@ def __instancecheck__(cls, instance): class Variable(metaclass=VariableMetaClass): """ - **Notes**: - **The constructor of Variable should not be invoked directly.** - **In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.** + Notes: + The constructor of Variable should not be invoked directly. + + In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed. - **In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data** + In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data. In Fluid, every input and output of an OP is a variable. In most cases, variables are used for holding different kinds of data or training @@ -1513,12 +1514,13 @@ def __init__( def detach(self): """ + Returns a new Variable, detached from the current graph. It will share data with origin Variable and without tensor copy. In addition, the detached Variable doesn't provide gradient propagation. Returns: - ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. + ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable. Examples: .. code-block:: python @@ -1532,6 +1534,7 @@ def detach(self): # create a detached Variable y = x.detach() + """ assert ( @@ -2081,6 +2084,7 @@ def type(self): @property def T(self): """ + Permute current Variable with its dimensions reversed. If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`. @@ -2099,6 +2103,7 @@ def T(self): x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0] print(x_T_np.shape) # (5, 3, 2) + """ if len(self.shape) == 1: return self @@ -2137,7 +2142,7 @@ def clone(self): as ``out = assign(tensor)`` . Returns: - Variable: The cloned Variable. + Variable, The cloned Variable. Examples: .. code-block:: python @@ -2167,6 +2172,7 @@ def clone(self): def _set_error_clip(self, error_clip): """ + Set the error_clip. Args: @@ -2174,11 +2180,13 @@ def _set_error_clip(self, error_clip): Returns: None + """ self.error_clip = error_clip def _set_info(self, key, value): """ + Set key-value information for this variable. Args: @@ -2187,6 +2195,7 @@ def _set_info(self, key, value): Returns: None + """ if not hasattr(self, "_info"): self._info = {} @@ -2194,6 +2203,7 @@ def _set_info(self, key, value): def _get_info(self, key): """ + Get the information of this variable corresponding to key. Args: @@ -2201,6 +2211,7 @@ def _get_info(self, key): Returns: object + """ if hasattr(self, "_info") and key in self._info: return self._info[key] @@ -2208,7 +2219,9 @@ def _get_info(self, key): def _slice_indices(self, slice, length): """ + Reference implementation for the slice.indices method. 
+ """ # Compute step and length as integers. step = 1 if slice.step is None else slice.step @@ -2379,7 +2392,7 @@ def get_value(self, scope=None): Default: None Returns: - Tensor: the value in given scope. + Tensor, the value in given scope. Examples: .. code-block:: python @@ -2434,6 +2447,7 @@ def get_value(self, scope=None): def set_value(self, value, scope=None): ''' + Set the value to the tensor in given scope. Args: @@ -2473,6 +2487,7 @@ def set_value(self, value, scope=None): if var.persistable: t_load = paddle.load(path+var.name+'.pdtensor') var.set_value(t_load) + ''' # The 'framework' is a low-level module, and 'executor' @@ -2543,10 +2558,11 @@ def set_value(self, value, scope=None): def size(self): """ + Returns the number of elements for current Variable, which is a int64 Variable with shape [1] Returns: - Variable: the number of elements for current Variable + Variable, the number of elements for current Variable Examples: .. code-block:: python @@ -2560,6 +2576,7 @@ def size(self): # get the number of elements of the Variable y = x.size() + """ output = self.block.create_var( @@ -2574,23 +2591,27 @@ def size(self): def _set_attr(self, name, val): """ + Set the value of attribute by attribute's name. Args: name(str): the attribute name. val(int|str|list): the value of the attribute. + """ self._update_desc_attr(name, val) def _has_attr(self, name): """ + Whether this Variable has the attribute with the name `name` or not. Args: name(str): the attribute name. Returns: - bool: True if has this attribute. + bool, True if has this attribute. + """ return self.desc.has_attr(name) @@ -2620,7 +2641,7 @@ def attr(self, name): name(str): the attribute name. Returns: - int|str|list: The attribute value. The return value + int|str|list, The attribute value. The return value can be any valid attribute type. """ return self.desc.attr(name) @@ -3193,14 +3214,16 @@ def type(self): def input(self, name): r""" + Get the input arguments according to the input parameter name. Args: name(str): The input parameter name. Returns: - list: return the list of argument names that associated with \ + list, return the list of argument names that associated with \ the specific parameter name. + """ return self.desc.input(name) diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 8a63b55089e8b..3179f5d568c4f 100755 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -37,22 +37,29 @@ def accuracy(input, label, k=1, correct=None, total=None): """ + accuracy layer. Refer to the https://en.wikipedia.org/wiki/Precision_and_recall This function computes the accuracy using the input and label. If the correct label occurs in top k predictions, then correct will increment by one. - Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Note: + the dtype of accuracy is determined by input. the input and label dtype can be different. + Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . - k(int): The top k predictions for each class will be checked. Data type is int64 or int32. - correct(Tensor): The correct predictions count. A Tensor with type int64 or int32. - total(Tensor): The total entries count. A tensor with type int64 or int32. 
+ k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1. + correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None. + total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None. + Returns: - Tensor: The correct rate. A Tensor with type float32. + Tensor, The correct rate. A Tensor with type float32. + Examples: .. code-block:: python + import numpy as np import paddle import paddle.static as static @@ -72,6 +79,7 @@ def accuracy(input, label, k=1, correct=None, total=None): fetch_list=[result[0]]) print(output) #[array([0.], dtype=float32)] + """ if _non_static_mode(): if correct is None: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 96ca8a459bd50..1f74a79a91b7c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9057,6 +9057,7 @@ def pow(x, factor=1.0, name=None): @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" + prelu activation. .. math:: @@ -9071,26 +9072,20 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): element: All elements do not share alpha. Each element has its own alpha. Parameters: - x (Tensor): The input Tensor or LoDTensor with data type float32. - mode (str): The mode for weight sharing. - - param_attr (ParamAttr|None, optional): The parameter attribute for the learnable \ - weight (alpha), it can be create by ParamAttr. None by default. \ - For detailed information, please refer to :ref:`api_fluid_ParamAttr`. - - name (str, optional): Name for the operation (optional, default is None). \ - For more information, please refer to :ref:`api_guide_Name`. - + param_attr (ParamAttr|None, optional): The parameter attribute for the learnable + weight (alpha), it can be create by ParamAttr. None by default. + For detailed information, please refer to :ref:`api_fluid_ParamAttr`. data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A tensor with the same shape and data type as x. + Tensor, A tensor with the same shape and data type as x. Examples: - .. code-block:: python import paddle diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 0887cd56aefe4..e6c8f33efb2b3 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm( name=None, ): r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) Parameters: @@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm( name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output Tensor, the data type and shape is same as `x`. + Tensor, The output Tensor, the data type and shape is same as `x`. Examples: - .. 
code-block:: python # required: gpu @@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm( x, residual, bias) # [2, 4, 128] print(output.shape) + """ seed = None if mode not in ('downscale_in_infer', 'upscale_in_train'): diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 5bf553f0eb77b..72b074a68cb15 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -705,6 +705,7 @@ def _amp_decorate(self, dtype): class FusedTransformerEncoderLayer(Layer): """ + FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If @@ -746,7 +747,6 @@ class FusedTransformerEncoderLayer(Layer): Examples: - .. code-block:: python # required: gpu @@ -759,6 +759,7 @@ class FusedTransformerEncoderLayer(Layer): attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + """ def __init__( @@ -835,7 +836,9 @@ def __init__( def forward(self, src, src_mask=None, cache=None): """ + Applies a Transformer encoder layer on the input. + Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. @@ -851,17 +854,19 @@ def forward(self, src, src_mask=None, cache=None): `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. - See `TransformerEncoderLayer.gen_cache` for more details. It is + See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is only used for inference and should be None for training. Default None. + Returns: - Tensor|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple, It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. + """ src_mask = _convert_attention_mask(src_mask, src.dtype) if cache is None: diff --git a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 821c4b418ed7e..bbe8d6a5646d3 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -28,6 +28,7 @@ def graph_khop_sampler( name=None, ): """ + Graph Khop Sampler API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -50,38 +51,36 @@ def graph_khop_sampler( sample_sizes (list|tuple): The number of neighbors and number of layers we want to sample. The data type should be int, and the shape should only have one dimension. - sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` + sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids` is True. The shape should be [num_edges, 1], and the data - type should be the same with `row`. 
- return_eids (bool): Whether to return the id of the sample edges. Default is False. + type should be the same with `row`. Default is None. + return_eids (bool, optional): Whether to return the id of the sample edges. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - edge_src (Tensor): The src index of the output edges, also means the first column of - the edges. The shape is [num_sample_edges, 1] currently. - edge_dst (Tensor): The dst index of the output edges, also means the second column - of the edges. The shape is [num_sample_edges, 1] currently. - sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. - reindex_nodes (Tensor): The reindex id of the input nodes. - edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. + - edge_src (Tensor), The src index of the output edges, also means the first column of + the edges. The shape is [num_sample_edges, 1] currently. + - edge_dst (Tensor), The dst index of the output edges, also means the second column + of the edges. The shape is [num_sample_edges, 1] currently. + - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes. + - reindex_nodes (Tensor), The reindex id of the input nodes. + - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True. Examples: - .. code-block:: python - import paddle + import paddle - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_sizes = [2, 2] - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_sizes = [2, 2] + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") - edge_src, edge_dst, sample_index, reindex_nodes = \ - paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) + edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) """ diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py index d721c9a002e18..0ac5f0246f26c 100644 --- a/python/paddle/incubate/operators/graph_reindex.py +++ b/python/paddle/incubate/operators/graph_reindex.py @@ -35,6 +35,7 @@ def graph_reindex( name=None, ): """ + Graph Reindex API. This API is mainly used in Graph Learning domain, which should be used @@ -42,11 +43,11 @@ def graph_reindex( is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: + Notes: The number in x should be unique, otherwise it would cause potential errors. - Besides, we also support multi-edge-types neighbors reindexing. If we have different - edge_type neighbors for x, we should concatenate all the neighbors and count of x. - We will reindex all the nodes from 0. + Besides, we also support multi-edge-types neighbors reindexing. If we have different + edge_type neighbors for x, we should concatenate all the neighbors and count of x. + We will reindex all the nodes from 0. Take input nodes x = [0, 1, 2] as an example. 
If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], @@ -60,53 +61,52 @@ def graph_reindex( should be the same with `x`. count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should - be int32, and should be filled with -1. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should - be int32, and should be filled with -1. - flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up. Default is False. Only useful for gpu version currently. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_dst (Tensor): The destination node index of graph edges after reindex. - out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - where we put the input nodes `x` in the front, and put neighbor - nodes in the back. + - reindex_src (Tensor), The source node index of graph edges after reindex. + - reindex_dst (Tensor), The destination node index of graph edges after reindex. + - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex, + where we put the input nodes `x` in the front, and put neighbor + nodes in the back. Examples: - .. 
code-block:: python - import paddle - - x = [0, 1, 2] - neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] - count_e1 = [2, 3, 2] - x = paddle.to_tensor(x, dtype="int64") - neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") - count_e1 = paddle.to_tensor(count_e1, dtype="int32") - - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) - # reindex_src: [3, 4, 0, 5, 6, 7, 6] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] - - neighbors_e2 = [0, 2, 3, 5, 1] - count_e2 = [1, 3, 1] - neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") - count_e2 = paddle.to_tensor(count_e2, dtype="int32") - - neighbors = paddle.concat([neighbors_e1, neighbors_e2]) - count = paddle.concat([count_e1, count_e2]) - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors, count) - # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + import paddle + + x = [0, 1, 2] + neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] + count_e1 = [2, 3, 2] + x = paddle.to_tensor(x, dtype="int64") + neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") + count_e1 = paddle.to_tensor(count_e1, dtype="int32") + + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) + # reindex_src: [3, 4, 0, 5, 6, 7, 6] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] + + neighbors_e2 = [0, 2, 3, 5, 1] + count_e2 = [1, 3, 1] + neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") + count_e2 = paddle.to_tensor(count_e2, dtype="int32") + + neighbors = paddle.concat([neighbors_e1, neighbors_e2]) + count = paddle.concat([count_e1, count_e2]) + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors, count) + # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] """ if flag_buffer_hashtable: diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py index 726a1676da125..a02dcffeff897 100644 --- a/python/paddle/incubate/xpu/resnet_block.py +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -325,6 +325,7 @@ def resnet_basic_block( class ResNetBasicBlock(Layer): r""" + ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time. If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this @@ -362,14 +363,14 @@ class ResNetBasicBlock(Layer): and variance are also used during train period. Default: False. is_test (bool, optional): A flag indicating whether it is in test phrase or not. Default: False. - filter_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. Default: None. - scale_attr (ParamAttr|None): The parameter attribute for Parameter `scale` + scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. 
If the Initializer of the param_attr is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The parameter attribute for the bias of batch_norm. + bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. If the Initializer of the bias_attr is not set, the bias is initialized zero. @@ -396,7 +397,6 @@ class ResNetBasicBlock(Layer): Examples: - .. code-block:: python # required: xpu @@ -426,6 +426,7 @@ class ResNetBasicBlock(Layer): out = resnet_basic_block.forward(x) print(out.shape) # [2, 8, 16, 16] + """ def __init__( diff --git a/python/paddle/signal.py b/python/paddle/signal.py index 82d46b8196763..5b6879c2855bc 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -259,6 +259,7 @@ def stft( name=None, ): r""" + Short-time Fourier transform (STFT). The STFT computes the discrete Fourier transforms (DFT) of short overlapping @@ -271,9 +272,12 @@ def stft( Where: - :math:`t`: The :math:`t`-th input window. + - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`, - or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + - :math:`N`: Value of `n_fft`. + - :math:`H`: Value of `hop_length`. Args: @@ -300,9 +304,9 @@ def stft( to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`( - real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( - `onesided` is `False`) + The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]` + (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]` + (`onesided` is `False`) Examples: .. code-block:: python @@ -319,6 +323,7 @@ def stft( x = paddle.randn([8, 48000], dtype=paddle.float64) + \ paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] + """ check_variable_and_dtype( x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft' diff --git a/python/paddle/sparse/nn/layer/activation.py b/python/paddle/sparse/nn/layer/activation.py index 91d5c198189dd..f87901123a5c0 100644 --- a/python/paddle/sparse/nn/layer/activation.py +++ b/python/paddle/sparse/nn/layer/activation.py @@ -20,6 +20,7 @@ class ReLU(Layer): """ + Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -44,6 +45,7 @@ class ReLU(Layer): relu = paddle.sparse.nn.ReLU() out = relu(sparse_x) # [0., 0., 1.] + """ def __init__(self, name=None): @@ -60,6 +62,7 @@ def extra_repr(self): class Softmax(Layer): r""" + Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Note: @@ -129,6 +132,7 @@ def extra_repr(self): class ReLU6(Layer): """ + Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -152,6 +156,7 @@ class ReLU6(Layer): sparse_x = dense_x.to_sparse_coo(1) relu6 = paddle.sparse.nn.ReLU6() out = relu6(sparse_x) + """ def __init__(self, name=None): @@ -168,6 +173,7 @@ def extra_repr(self): class LeakyReLU(Layer): r""" + Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. 
math:: @@ -199,6 +205,7 @@ class LeakyReLU(Layer): sparse_x = dense_x.to_sparse_coo(1) leaky_relu = paddle.sparse.nn.LeakyReLU(0.5) out = leaky_relu(sparse_x) + """ def __init__(self, negative_slope=0.01, name=None): From 0cdca6763224350385496c74e0680d14adc9c58c Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 22 Nov 2022 19:14:47 +0800 Subject: [PATCH 154/210] fix typo error (#48156) --- paddle/fluid/framework/custom_operator.cc | 12 +++++------- paddle/fluid/framework/grad_op_desc_maker.h | 2 +- paddle/fluid/framework/op_desc.h | 2 +- paddle/fluid/framework/op_proto_maker.cc | 4 ++-- paddle/fluid/operators/ops_extra_info.h | 2 +- paddle/phi/api/ext/op_meta_info.h | 5 ++--- 6 files changed, 12 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index a76b164467149..1ca2f4e56dd71 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -513,8 +513,8 @@ Custom Operator. According to the phi::DenseTensor operation function implemented by the user independently of the framework, it is encapsulated into a framework -operator to adapt to various execution scenarios such as dynamic graph, -mode static graph mode, and inference mode. +operator to adapt to various execution scenarios such as dynamic graph +mode, static graph mode, and inference mode. )DOC"); } @@ -979,11 +979,9 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, "Custom grad operator infershape error. " "If a custom grad operator contains only one input and " "only one output, the input shape will be directly set " - "to " - "the output shape. Otherwise, Please set the forward " - "input " - "as the grad operator's input or set the InferShapeFn " - "of custom grad operator by " + "to the output shape. Otherwise, Please set the forward " + "input as the grad operator's input or set the " + "InferShapeFn of custom grad operator by " ".SetInferShapeFn(PD_INFER_SHAPE(...))")); ctx->ShareDim(grad_op_inputs[0], out_name); } diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index bb36742d475ad..dd795e190bdd2 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -56,7 +56,7 @@ using GradOpPtr = typename details::GradOpPtrTrait::Type; operator fwd_op. After it is called (through operator()), the pairs of (gradient variable, corresponding input variable of fwd_op) will be added to grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its - gradient varialbe will be ignored or kEmptyVarName depending on the template + gradient variable will be ignored or kEmptyVarName depending on the template argument DropEmptyIG in the derived classes. */ class GradOpDescMakerBase { diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 6c6f13d7c929b..33460d08f729b 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -217,7 +217,7 @@ class OpDesc { return ret_val; } - // it it really needed? or just mantain a ptr from block? + // it it really needed? or just maintain a ptr from block? 
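The sparse activation docstrings tidied up above all share the same dense-to-sparse flow; a minimal sketch of that flow, assuming a Paddle build where `paddle.sparse` is available (values follow the ReLU/LeakyReLU docstring examples above):

.. code-block:: python

    import paddle

    # Convert a dense 1-D tensor to sparse COO, then apply the sparse activations
    # described in the docstrings above.
    dense_x = paddle.to_tensor([-2., 0., 1.])
    sparse_x = dense_x.to_sparse_coo(1)

    relu = paddle.sparse.nn.ReLU()
    out = relu(sparse_x)             # values: [0., 0., 1.]

    leaky_relu = paddle.sparse.nn.LeakyReLU(0.5)
    out = leaky_relu(sparse_x)       # values: [-1., 0., 1.]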
proto::OpDesc desc_; BlockDesc *block_{nullptr}; // not_own // input arg name => input variable names diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 5f75991b50671..fbad45e889156 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -98,12 +98,12 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, .SetDefault({}) .AsExtra(); - AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") + AddAttr(OpNamescopeAttrName(), "Operator name with namescope.") .SetDefault("") .AsExtra(); AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") + "Callstack for Op Creation.") .SetDefault({}) .AsExtra(); AddAttr(OpDeviceAttrName(), "Device type of this operator.") diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 33f8c8ddb9c8e..b16e4ed58f3fe 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -37,7 +37,7 @@ enum class ExtraAttrProperty : uint8_t { SCHEDULE, // The attributes for ONEDNN only, can be saved in OneDNNContext ONEDNN, - // The attributes for ONEDNN only, can be saved in GPUContext + // The attributes for GPUDNN only, can be saved in GPUContext GPUDNN, // Add necessary properties as needed }; diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 546b0accf8ba7..7d2be9c90d79e 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -271,7 +271,7 @@ struct KernelFuncImpl { static void Compute(CustomOpKernelContext* ctx, const Args&... args) { static_assert(out_idx == 0, "If return std::vector in Custom OpKernel, " - "you cannot pass output by kernel funciton argument."); + "you cannot pass output by kernel function argument."); auto outs = impl_fn(args...); auto* orig_outs = ctx->AllMutableOutput(); PD_CHECK(orig_outs->size() == outs.size(), @@ -626,8 +626,7 @@ class PADDLE_API OpMetaInfoBuilder { void RegisterAllCustomOperator(); // Using this api to load compiled custom operator's dynamic library and -// register Custom -// Operator into it +// register Custom Operator into it void LoadCustomOperatorLib(const std::string& dso_name); /////////////////////// Op register Macro ///////////////////////// From 1022b77729d7795540174fbd97327a7daf40061e Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Tue, 22 Nov 2022 19:36:30 +0800 Subject: [PATCH 155/210] fix:fix the bug of TRT_8.0.3.4 (#48135) * fix:fix the bug of trt_8.0.3.4 * fix: fix the bug of trt_8.0 * fix: notes --- paddle/fluid/inference/tensorrt/op_teller.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index f27c006f2080e..f68d6b7702679 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1760,8 +1760,8 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } else { -#if !IS_TRT_VERSION_GE(8000) - VLOG(3) << "The version of TRT must be greater than 8000"; +#if !IS_TRT_VERSION_GE(8100) + VLOG(3) << "The version of TRT must be greater than 8100"; return false; #endif } From cbdc86b5cb0664f6c840be3b063e7b9280468256 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 22 Nov 2022 19:49:49 +0800 Subject: [PATCH 156/210] Optimize the format of printing phi kernels (#48228) --- paddle/phi/core/kernel_factory.cc | 12 
+++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index a2b9f5971756b..0352312edc025 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -334,11 +334,17 @@ std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) { os << "{"; bool need_comma_kernels = false; for (const auto& op_kernel_pair : kernel_factory.kernels()) { - if (need_comma_kernels) os << ","; - os << "\"" << op_kernel_pair.first << "\":["; + if (need_comma_kernels) { + os << ","; + os << std::endl; + } + os << "\"" << op_kernel_pair.first << " \":[" << std::endl; bool need_comma_per_kernel = false; for (const auto& kernel_pair : op_kernel_pair.second) { - if (need_comma_per_kernel) os << ","; + if (need_comma_per_kernel) { + os << ","; + os << std::endl; + } os << "{\"" << kernel_pair.first << "\":" << kernel_pair.second << "}"; need_comma_per_kernel = true; } From 27f4925410aa78c0d6db36b0aa970ab9797b4f29 Mon Sep 17 00:00:00 2001 From: yuehuayingxueluo <867460659@qq.com> Date: Tue, 22 Nov 2022 21:28:16 +0800 Subject: [PATCH 157/210] clear fluid apis in loss.py v_1 (#48132) * clear fluid apis: center_loss, bpr_loss, edit_distance, hsigmoid, sampled_softmax_with_cross_entropy, rank_loss, margin_rank_loss, sigmoid_cross_entropy_with_logits, huber_loss * fix python/paddle/fluid/layers/loss.py * fix test_layers.py * fix CI bug * fix nn.py --- python/paddle/fluid/evaluator.py | 90 -- python/paddle/fluid/layers/loss.py | 849 ------------------ python/paddle/fluid/metrics.py | 2 +- .../unittests/ipu/test_huber_loss_op_ipu.py | 89 -- .../ipu/test_margin_rank_loss_op_ipu.py | 2 +- .../unittests/ipu/test_rank_loss_op_ipu.py | 75 -- .../unittests/mlu/test_huber_loss_op_mlu.py | 24 - .../unittests/npu/test_huber_loss_op_npu.py | 21 - .../fluid/tests/unittests/test_center_loss.py | 70 +- .../tests/unittests/test_dist_transpiler.py | 12 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 60 +- .../tests/unittests/test_huber_loss_op.py | 30 +- .../fluid/tests/unittests/test_layers.py | 84 -- .../unittests/test_margin_rank_loss_op.py | 5 +- .../tests/unittests/test_rank_loss_op.py | 28 - .../test_teacher_student_sigmoid_loss_op.py | 19 - python/paddle/nn/functional/loss.py | 1 - 17 files changed, 40 insertions(+), 1421 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index b8ccfd083cafd..f3be27eaf4aa3 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -23,7 +23,6 @@ from .layers import detection __all__ = [ - 'EditDistance', 'DetectionMAP', ] @@ -126,95 +125,6 @@ def _create_state(self, suffix, dtype, shape): return state -class EditDistance(Evaluator): - """ - Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance - instead. - Accumulate edit distance sum and sequence number from mini-batches and - compute the average edit_distance and instance error of all batches. - - Args: - input: the sequences predicted by network. - label: the target sequences which must have same sequence count - with input. - ignored_tokens(list of int): Tokens that should be removed before - calculating edit distance. - - Examples: - .. 
code-block:: python - - exe = fluid.executor(place) - distance_evaluator = fluid.Evaluator.EditDistance(input, label) - for epoch in PASS_NUM: - distance_evaluator.reset(exe) - for data in batches: - loss = exe.run(fetch_list=[cost]) - distance, instance_error = distance_evaluator.eval(exe) - - In the above example: - 'distance' is the average of the edit distance in a pass. - 'instance_error' is the instance error rate in a pass. - - """ - - def __init__(self, input, label, ignored_tokens=None, **kwargs): - super().__init__("edit_distance", **kwargs) - main_program = self.helper.main_program - if main_program.current_block().idx != 0: - raise ValueError("You can only invoke Evaluator in root block") - - self.total_distance = self._create_state( - dtype='float32', shape=[1], suffix='total_distance' - ) - self.seq_num = self._create_state( - dtype='int64', shape=[1], suffix='seq_num' - ) - self.instance_error = self._create_state( - dtype='int64', shape=[1], suffix='instance_error' - ) - distances, seq_num = layers.edit_distance( - input=input, label=label, ignored_tokens=ignored_tokens - ) - - zero = layers.fill_constant(shape=[1], value=0.0, dtype='float32') - compare_result = layers.equal(distances, zero) - compare_result_int = layers.cast(x=compare_result, dtype='int64') - seq_right_count = layers.reduce_sum(compare_result_int) - instance_error_count = layers.elementwise_sub( - x=seq_num, y=seq_right_count - ) - total_distance = layers.reduce_sum(distances) - layers.sums( - input=[self.total_distance, total_distance], out=self.total_distance - ) - layers.sums(input=[self.seq_num, seq_num], out=self.seq_num) - layers.sums( - input=[self.instance_error, instance_error_count], - out=self.instance_error, - ) - self.metrics.append(total_distance) - self.metrics.append(instance_error_count) - - def eval(self, executor, eval_program=None): - if eval_program is None: - eval_program = Program() - block = eval_program.current_block() - with program_guard(main_program=eval_program): - total_distance = _clone_var_(block, self.total_distance) - seq_num = _clone_var_(block, self.seq_num) - instance_error = _clone_var_(block, self.instance_error) - seq_num = layers.cast(x=seq_num, dtype='float32') - instance_error = layers.cast(x=instance_error, dtype='float32') - avg_distance = layers.elementwise_div(x=total_distance, y=seq_num) - avg_instance_error = layers.elementwise_div( - x=instance_error, y=seq_num - ) - result = executor.run( - eval_program, fetch_list=[avg_distance, avg_instance_error] - ) - return np.array(result[0]), np.array(result[1]) - - class DetectionMAP(Evaluator): """ Warning: This would be deprecated in the future. 
Please use fluid.metrics.DetectionMAP diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 1c4a1ef6acade..710382bdd26fa 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -35,21 +35,12 @@ from paddle import _C_ops, _legacy_C_ops __all__ = [ - 'center_loss', - 'bpr_loss', 'cross_entropy', 'square_error_cost', - 'edit_distance', 'warpctc', 'nce', - 'hsigmoid', - 'sampled_softmax_with_cross_entropy', 'softmax_with_cross_entropy', - 'rank_loss', - 'margin_rank_loss', 'sigmoid_cross_entropy_with_logits', - 'teacher_student_sigmoid_loss', - 'huber_loss', 'kldiv_loss', 'npair_loss', 'mse_loss', @@ -58,159 +49,6 @@ kIgnoreIndex = -100 -def center_loss( - input, label, num_classes, alpha, param_attr, update_center=True -): - r""" - :api_attr: Static Graph - - **Center loss Cost layer** - - This OP accepts input (deep features,the output of the last hidden layer) - and target label and return the center loss cost. The average of the - distances of each sample in the mini-batch from the center of the - corresponding category is calculated as the center loss. - - For deep features, :math:`X`, and target labels, :math:`Y`, the equation is: - - .. math:: - - Out = \\frac{1}{2}(X - Y)^2 - - Args: - input (Variable): a 2-D tensor with shape[N x M]. Its dtype should be float32 or float64. - label (Variable): the groud truth which is a 2-D tensor - with shape[N x 1],where N is the batch size. Its dtype should be int32. - num_classes (int): the number of classification categories. - alpha (float|Variable): learning rate of centers. - param_attr (ParamAttr): Attribute initializer of centers. - update_center (bool): whether to update value of center. - - Returns: - Variable: 2-D tensor with shape [N * 1] - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - - input = fluid.data(name='x',shape=[20,30],dtype='float32') - label = fluid.data(name='y',shape=[20,1],dtype='int64') - num_classes = 1000 - alpha = 0.01 - param_attr = fluid.initializer.Xavier(uniform=False) - center_loss=fluid.layers.center_loss(input=input, - label=label, - num_classes=1000, - alpha=alpha, - param_attr=fluid.initializer.Xavier(uniform=False), - update_center=True) - """ - helper = LayerHelper('center_loss', **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'center_loss' - ) - check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'center_loss') - - centers_shape = [num_classes, input.shape[1]] - centers_param = helper.create_parameter( - attr=param_attr, shape=centers_shape, dtype=dtype - ) - centers_param.stop_gradient = True - - if isinstance(alpha, Variable): - alpha_param = alpha - check_variable_and_dtype( - alpha, 'alpha', ['float32', 'float64'], 'center_loss' - ) - else: - assert isinstance(alpha, float) - alpha_param = helper.create_variable( - name="centerloss_alpha", - shape=[1], - dtype="float32", - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=True, - stop_gradient=True, - initializer=Constant(alpha), - ) - - centersdiff = helper.create_variable_for_type_inference(dtype=input.dtype) - loss = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='center_loss', - inputs={ - 'X': [input], - 'Label': [label], - 'Centers': [centers_param], - 'CenterUpdateRate': [alpha_param], - }, - outputs={ - 'SampleCenterDiff': [centersdiff], - 'Loss': [loss], - 'CentersOut': [centers_param], - }, - attrs={'cluster_num': num_classes, 'need_update': update_center}, - ) - return loss - - -def bpr_loss(input, label, name=None): - r""" - - **Bayesian Personalized Ranking Loss Operator** - - This operator belongs to pairwise ranking loss. Label is the desired item. - The loss at a given point in one session is defined as: - - .. math:: - Y[i] = 1/(N[i] - 1) * \sum_j{\log(\sigma(X[i, Label[i]]-X[i, j]))} - - Learn more details by reading paper . - - Args: - input (Variable|list): a 2-D tensor with shape [N x D], where N is the - batch size and D is the number of positive classes and negative classes - This input is not probability but logits. - label (Variable|list): the ground truth which is a 2-D tensor. `label` - is a tensor with shape [N x 1]. - name (str|None): A name for this layer(optional). If set None, the - layer will be named automatically. Default: None. - Returns: - A 2-D tensor with shape [N x 1], the bpr loss. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle - - paddle.enable_static() - - neg_size = 10 - label = fluid.data( - name="label", shape=[3, 1], dtype="int64") - predict = fluid.data( - name="predict", shape=[3, neg_size + 1], dtype="float32") - cost = fluid.layers.bpr_loss(input=predict, label=label) - """ - helper = LayerHelper('bpr_loss', **locals()) - out = helper.create_variable_for_type_inference(dtype=input.dtype) - check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64'], 'bpr_loss' - ) - helper.append_op( - type='bpr_loss', - inputs={'X': [input], 'Label': [label]}, - outputs={'Y': [out]}, - ) - return out - - def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): r""" :alias_main: paddle.nn.functional.cross_entropy @@ -347,86 +185,6 @@ def square_error_cost(input, label): return paddle.nn.functional.square_error_cost(input, label) -def edit_distance( - input, - label, - normalized=True, - ignored_tokens=None, - input_length=None, - label_length=None, -): - """ - This op computes the edit distances, also called Levenshtein distance, between a batch of - hypothesis strings and their references. It measures how dissimilar two strings are by counting - the minimum number of operations to transform one string into another. - The operations include insertion, deletion, and substitution. - - For example, given hypothesis string A = "kitten" and reference - B = "sitting", A will be transformed into B - at least after two substitutions and one insertion: - - "kitten" -> "sitten" -> "sittin" -> "sitting" - - So the edit distance between A and B is 3. - - The input is a Tensor, the input_length and label_length should be supported. - - The `batch_size` of labels should be same as `input`. - - The output include the edit distance value between every pair of input and related label, and the number of sequence. - If Attr(normalized) is true, - the edit distance value will be divided by the length of label. - - Parameters: - input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64. - label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64. - normalized(bool, default True): Indicated whether to normalize the edit distance. - ignored_tokens(list, default None): Tokens that will be removed before - calculating edit distance. - input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. - label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. - NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] - NOTE: This Api is different from fluid.metrics.EditDistance - - Returns: - Tuple: - - distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). - sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn.functional as F - - input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64') - label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64') - input_len = paddle.to_tensor([3,3,3,3], dtype='int64') - label_len = paddle.to_tensor([4,4,4,4], dtype='int64') - - distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False) - - # print(distance) - # [[3.] - # [2.] - # [4.] - # [1.]] - # if set normalized to True - # [[0.75] - # [0.5 ] - # [1. ] - # [0.25] - # - # print(sequence_num) - # [4] - - """ - return paddle.nn.functional.loss.edit_distance( - input, label, normalized, ignored_tokens, input_length, label_length - ) - - def warpctc( input, label, @@ -837,363 +595,6 @@ def _init_by_numpy_array(numpy_array): return cost / (num_neg_samples + 1) -def hsigmoid( - input, - label, - num_classes, - param_attr=None, - bias_attr=None, - name=None, - path_table=None, - path_code=None, - is_custom=False, - is_sparse=False, -): - """ - :api_attr: Static Graph - - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. - - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. - - Parameters: - input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch, - and D is the feature size. Its data type supports float32 and float64. - label (Variable): A tensor contains the labels of training data. Its shape is [N, 1] - and data type is int64. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights - of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a - ParamAttr as param_attr. 
If the Initializer of the param_attr is not set, the parameter is - initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it - is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, - hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not - set, the bias is initialized zero. Default: None. - name (str, optional): Normally there is no need for user to set this property. For more information, - please refer to :ref:`api_guide_Name`. Default: None. - path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root - node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, - path_table[i] is a np.array like structure and each element in this array is the indexes in parent - nodes' weight matrix. Default: None. - path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf - to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. - Each code of path is consisted with the code of nodes from leaf to root node. Default: None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, :attr:`path_table`, - :attr:`path_code` and :attr:`num_classes` should be set, otherwise :attr:`num_classes` should - be set. Default: False. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. - - Returns: - Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.fill_constant(shape=[4, 3], value=0.9, dtype='float32') - # x = [[0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9], [0.9, 0.9, 0.9]] - y = fluid.layers.fill_constant( - shape=[4, 1], value=1, dtype='int64') - # y = [[1], [1], [1], [1]] - out = fluid.layers.hsigmoid(input=x, label=y, num_classes=2, param_attr=fluid.initializer.Constant( - value=0.05), bias_attr=fluid.initializer.Constant(value=.0)) - # out = [[0.62792355], [0.62792355], [0.62792355], [0.62792355]] - """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'hsigmoid') - check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid') - - helper = LayerHelper('hierarchical_sigmoid', **locals()) - dtype = helper.input_dtype() - out = helper.create_variable_for_type_inference(dtype) - pre_out = helper.create_variable_for_type_inference(dtype) - dim = input.shape[1] - if ((num_classes is None) or (num_classes < 2)) and (not is_custom): - raise ValueError( - "num_classes must not be less than 2 with default tree" - ) - - if (not is_custom) and (is_sparse): - print("Sparse mode should not be used without custom tree") - is_sparse = False - - if (not is_custom) and ( - (path_table is not None) or (path_code is not None) - ): - raise ValueError( - "only num_classes should be passed without custom tree" - ) - - if (is_custom) and (path_code is None): - raise ValueError("path_code should not be None with custom tree") - elif (is_custom) and (path_table is None): - raise ValueError("path_table should not be None with custom tree") - elif (is_custom) and (num_classes is None): - raise ValueError("num_classes should not be None with custom tree") - else: - pass - - weights = None - remote_prefetch = is_sparse - print( - "With sparse mode, if your models has only small parameter prefetch may cause speed down" - ) - if not is_custom: - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes - 1, dim], - is_bias=False, - dtype=input.dtype, - ) - else: - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes, dim], - is_bias=False, - dtype=input.dtype, - ) - inputs = { - "X": input, - "W": weights, - "PathTable": path_table, - "PathCode": path_code, - "Label": label, - } - if helper.bias_attr: - if not is_custom: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_classes - 1, 1], - is_bias=True, - dtype=input.dtype, - ) - inputs['Bias'] = bias - else: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[num_classes, 1], - is_bias=True, - dtype=input.dtype, - ) - inputs['Bias'] = bias - helper.append_op( - type="hierarchical_sigmoid", - inputs=inputs, - outputs={"Out": out, "PreOut": pre_out, "W_Out": weights}, - attrs={ - "num_classes": num_classes, - "is_sparse": is_sparse, - "remote_prefetch": remote_prefetch, - }, - ) - return out - - -def sampled_softmax_with_cross_entropy( - logits, - label, - num_samples, - num_true=1, - remove_accidental_hits=True, - use_customized_samples=False, - customized_samples=None, - customized_probabilities=None, - seed=0, -): - """ - **Sampled Softmax With Cross Entropy Operator.** - - Cross entropy loss with sampled softmax is used as the output layer for - larger output classes extensively. This operator samples a number of samples - for all examples, and computes the softmax normalized values for each - row of the sampled tensor, after which cross-entropy loss is computed. 
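sampled_softmax_with_cross_entropy is retired here along with the other fluid loss entry points; the plain, non-sampled path remains available through the standard 2.x functional API. A minimal sketch of that call (ordinary `paddle.nn.functional.cross_entropy`, not something introduced by this patch):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    logits = paddle.randn([4, 10])                      # unscaled logits; softmax is applied internally
    label = paddle.randint(0, 10, [4], dtype='int64')   # hard class ids

    loss = F.cross_entropy(logits, label)               # scalar, mean reduction by default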
- - Because this operator performs a softmax on logits internally, it expects - unscaled logits. This operator should not be used with the output of - softmax operator since that would produce incorrect results. - - For examples with T true labels (T >= 1), we assume that each true label has - a probability of 1/T. For each sample, S samples are generated using a - log uniform distribution. True labels are concatenated with these samples to - form T + S samples for each example. So, assume the shape of logits is - [N x K], the shape for samples is [N x (T+S)]. For each sampled label, a - probability is calculated, which corresponds to the Q(y|x) in - [Jean et al., 2014](http://arxiv.org/abs/1412.2007). - - Logits are sampled according to the sampled labels. Then if - remove_accidental_hits is True, if a sample[i, j] accidentally hits true - labels, then the corresponding sampled_logits[i, j] is minus by 1e20 to - make its softmax result close to zero. Then sampled logits are subtracted by - logQ(y|x), these sampled logits and re-indexed labels are used to compute - a softmax with cross entropy. - - Args: - logits (Variable): The unscaled log probabilities, which is a 2-D tensor - with shape [N x K]. N is the batch_size, and K is the class number. - label (Variable): The ground truth which is a 2-D tensor. Label is a - Tensor with shape [N x T], where T is the number of true - labels per example. - num_samples (int): The number for each example, num_samples should be - less than the number of class. - num_true(int): The number of target classes per training example. - remove_accidental_hits (bool): A flag indicating whether to remove - accidental hits when sampling. If True and if a sample[i, j] - accidentally hits true labels, then the corresponding - sampled_logits[i, j] is minus by 1e20 to make its softmax result - close to zero. Default is True. - use_customized_samples (bool): Whether to use custom samples and probabities to sample - logits. - customized_samples (Variable): User defined samples, which is a 2-D tensor - with shape [N, T + S]. S is the num_samples, and T is the number of true - labels per example. - customized_probabilities (Variable): User defined probabilities of samples, - a 2-D tensor which has the same shape with customized_samples. - seed (int): The random seed for generating random number, which is used - in the process of sampling. Default is 0. - - Returns: - Variable: Return the cross entropy loss which is a 2-D tensor with shape - [N x 1]. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - input = fluid.layers.data(name='data', shape=[256], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - fc = fluid.layers.fc(input=input, size=100) - out = fluid.layers.sampled_softmax_with_cross_entropy( - logits=fc, label=label, num_samples=25) - """ - if _non_static_mode(): - sample_logits_attrs = ( - 'use_customized_samples', - use_customized_samples, - 'uniq', - True, - 'remove_accidental_hits', - remove_accidental_hits, - 'num_samples', - num_samples, - 'seed', - seed, - ) - ( - _, - _, - _, - _, - sampled_logits_out, - sampled_label_out, - ) = _legacy_C_ops.sample_logits(logits, label, *sample_logits_attrs) - depth = num_samples + 1 - sampled_softlabel_out = _legacy_C_ops.one_hot( - sampled_label_out, 'depth', depth - ) - - softmax_with_cross_entropy_attrs = ( - 'soft_label', - True, - 'numeric_stable_mode', - False, - ) - - _, loss = _legacy_C_ops.softmax_with_cross_entropy( - sampled_logits_out, - sampled_softlabel_out, - *softmax_with_cross_entropy_attrs - ) - return loss / num_true - - helper = LayerHelper('sample_logits', **locals()) - samples = ( - customized_samples - if use_customized_samples - else helper.create_variable_for_type_inference(dtype='int64') - ) - probabilities = ( - customized_probabilities - if use_customized_samples - else helper.create_variable_for_type_inference(dtype=logits.dtype) - ) - sampled_logits = helper.create_variable_for_type_inference( - dtype=logits.dtype - ) - sampled_label = helper.create_variable_for_type_inference(dtype='int64') - sampled_softlabel = helper.create_variable_for_type_inference( - dtype=logits.dtype - ) - logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype) - labels_dim = helper.create_variable_for_type_inference(dtype=label.type) - - helper.append_op( - type='sample_logits', - inputs={ - 'Logits': logits, - 'Labels': label, - 'CustomizedSamples': customized_samples, - 'CustomizedProbabilities': customized_probabilities, - }, - outputs={ - 'Samples': samples, - 'Probabilities': probabilities, - 'SampledLabels': sampled_label, - 'SampledLogits': sampled_logits, - 'LogitsDim': logits_dim, - 'LabelsDim': labels_dim, - }, - attrs={ - 'use_customized_samples': use_customized_samples, - 'uniq': True, - 'remove_accidental_hits': remove_accidental_hits, - 'num_samples': num_samples, - 'seed': seed, - }, - ) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - helper.append_op( - type='one_hot', - inputs={'X': sampled_label}, - attrs={'depth': num_samples + 1}, - outputs={'Out': sampled_softlabel}, - ) - - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': sampled_logits, 'Label': sampled_softlabel}, - outputs={'Softmax': softmax, 'Loss': loss}, - attrs={ - 'soft_label': True, - 'ignore_index': False, - 'numeric_stable_mode': False, - }, - ) - return loss / num_true - - def softmax_with_cross_entropy( logits, label, @@ -1364,118 +765,6 @@ def identity_loss(x, reduction="none"): return out -def rank_loss(label, left, right, name=None): - r""" - - This operator implements the sort loss layer in the RankNet model. RankNet is a pairwise ranking model - with a training sample consisting of a pair of documents (A and B), The label (P) - indicates whether A is ranked higher than B or not. 
Please refer to more details: - `RankNet `_ - - Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and - label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores - for documents A and B and the value of label P. Rank loss layer takes batch inputs - with size batch_size (batch_size >= 1), P = {0, 1} or {0, 0.5, 1}, - where 0.5 means that there is no information about the rank of the input pair. - The following equation computes rank loss C_{i,j} from the inputs: - - .. math:: - C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ - .. math:: - o_{i,j} &= o_i - o_j \\\\ - .. math:: - \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} - - Parameters: - label (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32, batch indicates the size of the data. Indicats whether A ranked higher than B or not. - left (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc A. - right (Variable): 2-D ``Tensor`` with the shape of :math:`[batch,1]`, the data type is float32. RankNet's output score for doc B. - name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: ``Tensor`` indicating the output value of the sort loss layer, the data type is float32, and the return value's shape is :math:`[batch,1]` . - - Raises: - ValueError: Any of label, left, and right is not a ``Variable`` . - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - label = fluid.data(name="label", shape=[-1, 1], dtype="float32") - left = fluid.data(name="left", shape=[-1, 1], dtype="float32") - right = fluid.data(name="right", shape=[-1, 1], dtype="float32") - out = fluid.layers.rank_loss(label, left, right) - - """ - helper = LayerHelper('rank_loss', **locals()) - check_variable_and_dtype(label, 'label', ['float32'], "rank_loss") - check_variable_and_dtype(left, 'left', ['float32'], "rank_loss") - check_variable_and_dtype(right, 'right', ['float32'], "rank_loss") - - out = helper.create_variable_for_type_inference("float32") - - helper.append_op( - type='rank_loss', - inputs={"Label": label, "Left": left, "Right": right}, - outputs={'Out': out}, - ) - return out - - -def margin_rank_loss(label, left, right, margin=0.1, name=None): - r""" - Margin Ranking Loss Layer for ranking problem, - which compares left score and right score passed in. - The ranking loss can be defined as following equation: - - .. math:: - - rank\_loss = max(0, -label * (left - right) + margin) - - Args: - label (Variable): Indicates whether the left is ranked higher than the right or not. - Data type is float32. - left (Variable): Ranking score for left. Data type float32. - right (Variable): Ranking score for right. Data type float32. - margin (float): Indicates the given margin. - name(str|None): For detailed information, please refer to - :ref:`api_guide_Name` . Usually name is no need to set and None by default. - - Returns: - Variable: The ranking loss. - - Raises: - ValueError: Any of label, left, and right is not a Variable. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - label = fluid.data(name="label", shape=[-1, 1], dtype="float32") - left = fluid.data(name="left", shape=[-1, 1], dtype="float32") - right = fluid.data(name="right", shape=[-1, 1], dtype="float32") - out = fluid.layers.margin_rank_loss(label, left, right) - """ - helper = LayerHelper('margin_rank_loss', **locals()) - check_variable_and_dtype(label, 'label', ['float32'], 'margin_rank_loss') - check_variable_and_dtype(label, 'left', ['float32'], 'margin_rank_loss') - check_variable_and_dtype(label, 'right', ['float32'], 'margin_rank_loss') - out = helper.create_variable_for_type_inference(left.dtype) - act = helper.create_variable_for_type_inference(left.dtype) - helper.append_op( - type='margin_rank_loss', - inputs={"Label": label, "X1": left, "X2": right}, - outputs={'Out': out, 'Activated': act}, - attrs={'margin': margin}, - ) - return out - - @templatedoc() def sigmoid_cross_entropy_with_logits( x, label, ignore_index=kIgnoreIndex, name=None, normalize=False @@ -1539,144 +828,6 @@ def sigmoid_cross_entropy_with_logits( return out -def teacher_student_sigmoid_loss( - input, label, soft_max_up_bound=15.0, soft_max_lower_bound=-15.0 -): - """ - - **Teacher Student Log Loss Layer** - - This layer accepts input predictions and target label and returns the - teacher_student loss. Z is click or not, z' is value of teacher loss, label = {-2, -1, [0, 2]} - when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; - when z' is exist , clk = 0 : label = 0 + z'; when z' is exist , clk = 1 : label = 1 + z' - - .. math:: - loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) - - Args: - input (Variable|list): a 2-D tensor with shape [N x 1], where N is the - batch size. This input is a probability computed - by the previous operator. - label (Variable|list): the ground truth which is a 2-D tensor with - shape [N x 1], where N is the batch size. - soft_max_up_bound (float): if input > soft_max_up_bound, will be bound - soft_max_lower_bound (float): if input < soft_max_lower_bound, will be bound - - Returns: - Variable: A 2-D tensor with shape [N x 1], the teacher_student_sigmoid_loss. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - batch_size = 64 - label = fluid.data( - name="label", shape=[batch_size, 1], dtype="int64") - similarity = fluid.data( - name="similarity", shape=[batch_size, 1], dtype="float32") - cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) - - """ - check_variable_and_dtype( - input, - "input", - ['float32', 'float64', 'int32', 'int64'], - 'teacher_student_sigmoid_loss', - ) - check_variable_and_dtype( - label, - "label", - ['float32', 'float64', 'int32', 'int64'], - 'teacher_student_sigmoid_loss', - ) - - helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) - out = helper.create_variable(dtype=input.dtype) - helper.append_op( - type='teacher_student_sigmoid_loss', - inputs={'X': [input], 'Label': [label]}, - outputs={'Y': [out]}, - attrs={ - "soft_max_lower_bound": float(soft_max_lower_bound), - "soft_max_up_bound": float(soft_max_up_bound), - }, - ) - return out - - -def huber_loss(input, label, delta): - r""" - This operator computes the Huber loss between input and label. - Huber loss is commonly used in regression tasks. Compared to square_error_cost, Huber loss is more robust and less sensitivity to outliers. 
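The fluid margin_rank_loss removed above maps onto paddle.nn.functional.margin_ranking_loss, which the updated IPU and unit tests later in this patch call with the argument order (left, right, label). A small sketch of the replacement call; the scores below are made up for illustration:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    left = paddle.to_tensor([[0.7], [0.2]])
    right = paddle.to_tensor([[0.3], [0.9]])
    label = paddle.to_tensor([[1.0], [-1.0]])   # 1: left should rank higher, -1: right should

    # Elementwise max(0, -label * (left - right) + margin), matching the removed docstring.
    loss = F.margin_ranking_loss(left, right, label, margin=0.1, reduction='none')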
- - When the absolute difference between input and label is greater than delta, the linear error is calculated: - - .. math:: - huber\_loss = delta * (label - input) - 0.5 * delta * delta - - When the absolute difference between input and label is greater than delta, the square error is calculated: - - .. math:: - huber\_loss = 0.5 * (label - input) * (label - input) - - - Args: - input (Variable): Predicted data, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. - label (Variable): Ground truth label, 2D-Tensor with the shape of [batch_size, 1]. The data type should be float32. - delta (float): The threshold for Huber loss, which is used to control the balance between the linear error and square error. The data type should be float32. - - Returns: - Variable: The huber loss, a tensor with the same shape and data type as input. - - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - DATATYPE='float32' - input_data = np.array([[1.],[2.],[3.],[4.]]).astype(DATATYPE) - label_data = np.array([[3.],[3.],[4.],[4.]]).astype(DATATYPE) - - x = fluid.data(name='input', shape=[None, 1], dtype=DATATYPE) - y = fluid.data(name='label', shape=[None, 1], dtype=DATATYPE) - loss = fluid.layers.huber_loss(input=x, label=y, delta=1.0) - - place = fluid.CPUPlace() - #place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - HuberLoss, = exe.run(feed={'input':input_data ,'label':label_data}, fetch_list=[loss.name]) - print(HuberLoss) #[[1.5], [0.5], [0.5], [0. ]], dtype=float32 - """ - if in_dygraph_mode(): - out, residual = _C_ops.huber_loss(input, label, delta) - return out - - helper = LayerHelper('huber_loss', **locals()) - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'huber_loss' - ) - check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'huber_loss' - ) - residual = helper.create_variable_for_type_inference( - dtype=helper.input_dtype() - ) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op( - type='huber_loss', - inputs={'X': input, 'Y': label}, - outputs={'Out': out, 'Residual': residual}, - attrs={'delta': delta}, - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div") @templatedoc() def kldiv_loss(x, target, reduction='mean', name=None): diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 5776b4efdf3b9..c775bff5f3f7f 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -702,7 +702,7 @@ def eval(self): """ if self.seq_num == 0: raise ValueError( - "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." + "There is no data in EditDistance Metric. Please check paddle.nn.functional.loss.edit_distance output has been added to EditDistance." ) avg_distance = self.total_distance / self.seq_num avg_instance_error = self.instance_error / float(self.seq_num) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py deleted file mode 100644 index 5030e368083ee..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
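The Huber loss definition in the removed docstring above is easy to check by hand; a small NumPy sketch of the piecewise rule (it follows the same rule as the huber_loss_forward helper kept in the op unit test further below), reproducing the [[1.5], [0.5], [0.5], [0.]] result from the removed example:

.. code-block:: python

    import numpy as np

    def huber_loss_forward(residual, delta):
        # Squared branch inside [-delta, delta], linear branch outside,
        # as in the removed fluid.layers.huber_loss docstring.
        abs_res = abs(residual)
        if abs_res <= delta:
            return 0.5 * residual * residual
        return delta * abs_res - 0.5 * delta * delta

    residual = np.array([2.0, 1.0, 1.0, 0.0])   # label - input from the removed example
    print([huber_loss_forward(r, 1.0) for r in residual])   # [1.5, 0.5, 0.5, 0.0]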
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - - -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() - - def set_data_feed(self): - x = np.random.uniform(size=[3, 4, 2, 2]) - target = np.random.uniform(size=[3, 4, 2, 2]) - self.feed_fp32 = { - "x": x.astype(np.float32), - "target": target.astype(np.float32), - } - self.feed_fp16 = { - "x": x.astype(np.float16), - "target": target.astype(np.float16), - } - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - - def set_op_attrs(self): - self.attrs = { - 'delta': 1.0, - } - - @IPUOpTest.static_graph - def build_model(self, on_ipu): - x = paddle.static.data( - name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32" - ) - target = paddle.static.data( - name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32' - ) - out = paddle.fluid.layers.huber_loss(x, target, **self.attrs) - self.fetch_list = [out.name] - - def run_model(self, exec_mode): - self.run_op_test(exec_mode) - - def test(self): - for m in IPUOpTest.ExecutionMode: - if not self.skip_mode(m): - self.build_model(self.is_ipu_mode(m)) - self.run_model(m) - self.check() - - -class TestCase1(TestBase): - def set_op_attrs(self): - self.attrs = { - 'delta': 0.5, - } - - -class TestCase2(TestBase): - def set_op_attrs(self): - self.attrs = { - 'delta': 0.0, - } - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py index 5861009fd8518..371c9708ca0f4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py @@ -63,7 +63,7 @@ def build_model(self, on_ipu): right = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32' ) - out = paddle.fluid.layers.margin_rank_loss(label, left, right) + out = paddle.nn.functional.margin_ranking_loss(left, right, label) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py deleted file mode 100644 index bebe0e2232770..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
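The RankNet loss that fluid.layers.rank_loss used to expose (and that the deleted IPU test here exercised) is a one-liner to reproduce; a NumPy sketch of the formula from the removed docstring, with made-up scores:

.. code-block:: python

    import numpy as np

    def rank_loss(label, left, right):
        # C = -P * (left - right) + log(1 + exp(left - right)), per the removed docstring
        o = left - right
        return -label * o + np.log1p(np.exp(o))

    label = np.array([1.0, 0.0, 0.5])   # P in {0, 0.5, 1}
    left = np.array([0.8, 0.1, 0.4])
    right = np.array([0.3, 0.9, 0.4])
    print(rank_loss(label, left, right))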
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - - -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.set_data_feed() - self.set_feed_attr() - - def set_data_feed(self): - label = np.random.uniform(size=[3, 1]) - left = np.random.uniform(size=[3, 1]) - right = np.random.uniform(size=[3, 1]) - self.feed_fp32 = { - "label": label.astype(np.float32), - "left": left.astype(np.float32), - "right": right.astype(np.float32), - } - self.feed_fp16 = { - "label": label.astype(np.float16), - "left": left.astype(np.float16), - "right": right.astype(np.float16), - } - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - - @IPUOpTest.static_graph - def build_model(self, on_ipu): - label = paddle.static.data( - name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32" - ) - left = paddle.static.data( - name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32' - ) - right = paddle.static.data( - name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32' - ) - out = paddle.fluid.layers.rank_loss(label, left, right) - self.fetch_list = [out.name] - - def run_model(self, exec_mode): - self.run_op_test(exec_mode) - - def test(self): - for m in IPUOpTest.ExecutionMode: - if not self.skip_mode(m): - self.build_model(self.is_ipu_mode(m)) - self.run_model(m) - self.check() - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py index 35003418095cb..5e83c7e57daa2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py @@ -37,7 +37,6 @@ class TestHuberLossOp(OpTest): def setUp(self): self.op_type = 'huber_loss' self.set_mlu() - self.python_api = paddle.fluid.layers.huber_loss self.python_out_sig = ["Out"] self.delta = 1.0 self.init_input() @@ -103,28 +102,5 @@ def set_shape(self): return (6, 6, 1) -class TestHuberLossOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # the input and label must be Variable - xw = np.random.random((6, 6)).astype("float32") - xr = fluid.data(name='xr', shape=[None, 6], dtype="float32") - lw = np.random.random((6, 6)).astype("float32") - lr = fluid.data(name='lr', shape=[None, 6], dtype="float32") - delta = 1.0 - self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta) - self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta) - - # the dtype of input and label must be float32 or float64 - xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32") - lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32") - self.assertRaises( - TypeError, fluid.layers.huber_loss, xw2, lr, delta - ) - self.assertRaises( - TypeError, fluid.layers.huber_loss, xr, lw2, delta - ) - - if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py index 0e81f00de62c8..d4f602a0381da 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py @@ -126,27 +126,6 @@ def init_dtype(self): @unittest.skipIf( not paddle.is_compiled_with_npu(), "core is not compiled with NPU" ) -class TestHuberLossOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # the input and label must be Variable - xw = np.random.random((6, 6)).astype("float32") - xr = fluid.data(name='xr', shape=[None, 6], dtype="float32") - lw = np.random.random((6, 6)).astype("float32") - lr = fluid.data(name='lr', shape=[None, 6], dtype="float32") - delta = 1.0 - self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta) - self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta) - - # the dtype of input and label must be float32 or float64 - xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32") - lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32") - self.assertRaises( - TypeError, fluid.layers.huber_loss, xw2, lr, delta - ) - self.assertRaises( - TypeError, fluid.layers.huber_loss, xr, lw2, delta - ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_center_loss.py b/python/paddle/fluid/tests/unittests/test_center_loss.py index 7bf68100e029d..bf21e64945f56 100644 --- a/python/paddle/fluid/tests/unittests/test_center_loss.py +++ b/python/paddle/fluid/tests/unittests/test_center_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid +import paddle class TestCenterLossOp(OpTest): @@ -89,72 +89,6 @@ def config(self): self.need_update = False -class BadInputTestCenterLoss(unittest.TestCase): - def test_error(self): - with fluid.program_guard(fluid.Program()): - - def test_bad_x(): - data = [[1, 2, 3, 4], [5, 6, 7, 8]] - label = fluid.layers.data( - name='label', shape=[2, 1], dtype='int32' - ) - res = fluid.layers.center_loss( - data, - label, - num_classes=1000, - alpha=0.2, - param_attr=fluid.initializer.Xavier(uniform=False), - update_center=True, - ) - - self.assertRaises(TypeError, test_bad_x) - - def test_bad_y(): - data = fluid.layers.data( - name='data', shape=[2, 32], dtype='float32' - ) - label = [[2], [3]] - res = fluid.layers.center_loss( - data, - label, - num_classes=1000, - alpha=0.2, - param_attr=fluid.initializer.Xavier(uniform=False), - update_center=True, - ) - - self.assertRaises(TypeError, test_bad_y) - - def test_bad_alpha(): - data = fluid.layers.data( - name='data2', - shape=[2, 32], - dtype='float32', - append_batch_size=False, - ) - label = fluid.layers.data( - name='label2', - shape=[2, 1], - dtype='int32', - append_batch_size=False, - ) - alpha = fluid.layers.data( - name='alpha', - shape=[1], - dtype='int64', - append_batch_size=False, - ) - res = fluid.layers.center_loss( - data, - label, - num_classes=1000, - alpha=alpha, - param_attr=fluid.initializer.Xavier(uniform=False), - update_center=True, - ) - - self.assertRaises(TypeError, test_bad_alpha) - - if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index ce02bc4af7950..210dff8d6269d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py 
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -1440,14 +1440,18 @@ def network_with_table(self, is_sparse, is_distributed): ), ) - cost = fluid.layers.hsigmoid( + loss = paddle.nn.HSigmoidLoss( + feature_size=emb.shape[1], + num_classes=num_total_classes, + is_custom=True, + is_sparse=is_sparse, + ) + + cost = loss( input=emb, label=label, - num_classes=num_total_classes, path_table=path_table, path_code=path_code, - is_custom=True, - is_sparse=is_sparse, ) avg_cost = paddle.mean(cost) # optimizer diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index db3627d21ce75..733ce0e6ce140 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -17,7 +17,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid import Program, program_guard import paddle.fluid.initializer as I import math from op_test import OpTest, skip_check_grad_ci @@ -305,15 +304,19 @@ def hs_net_conf(self, is_sparse): ), ) - cost = fluid.layers.hsigmoid( + loss = paddle.nn.HSigmoidLoss( + feature_size=emb.shape[1], + num_classes=3, + bias_attr=True, + is_custom=True, + is_sparse=is_sparse, + ) + + cost = loss( input=emb, label=label, - bias_attr=True, - num_classes=3, path_table=path_table, path_code=path_code, - is_custom=True, - is_sparse=is_sparse, ) avg_cost = fluid.layers.reduce_mean(cost) @@ -633,16 +636,19 @@ def test_fluid_api(self): path_code = fluid.data('path_code', [-1, -1], 'int64') weight_attr = I.NumpyArrayInitializer(self.weight_np) bias_attr = I.NumpyArrayInitializer(self.bias_np) - out = fluid.layers.hsigmoid( - x, - labels, - self.num_classes, - weight_attr, - bias_attr, - 'out', - path_table, - path_code, - self.is_custom, + loss = paddle.nn.HSigmoidLoss( + feature_size=x.shape[1], + num_classes=self.num_classes, + weight_attr=weight_attr, + bias_attr=bias_attr, + is_custom=self.is_custom, + name='out', + ) + out = loss( + input=x, + label=labels, + path_table=path_table, + path_code=path_code, ) exe = fluid.Executor(self.place) @@ -730,28 +736,6 @@ def test_errors(self): self.assertRaises(ValueError, F.hsigmoid_loss, x, label, 0, weight) paddle.enable_static() - # test paddle.fluid.layers.hsigmoid - with program_guard(Program()): - label = fluid.data('label', [4, 1], 'int64') - # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.hsigmoid, 1, label, 2) - # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[4, 3], dtype='int32') - self.assertRaises( - TypeError, fluid.layers.hsigmoid, x_int32, label, 2 - ) - # support the input dtype is float32 - x_fp32 = fluid.data(name='x_fp32', shape=[4, 3], dtype='float32') - fluid.layers.hsigmoid(x_fp32, label, 2) - - # The label type must be Variable. - self.assertRaises(TypeError, fluid.layers.hsigmoid, x_fp32, 1, 2) - # The label dtype must be int64. 
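The test updates above show the migration pattern for hierarchical sigmoid: build a paddle.nn.HSigmoidLoss layer and call it, instead of fluid.layers.hsigmoid. A minimal dynamic-graph sketch with the default tree (shapes and values are illustrative; CPU is used here, matching the fluid-era tests for this op):

.. code-block:: python

    import paddle

    paddle.set_device('cpu')
    x = paddle.randn([4, 16])                          # [batch, feature_size]
    label = paddle.randint(0, 6, [4], dtype='int64')   # class ids

    # Layer-style replacement used by the updated tests in this patch; for a custom
    # tree, pass is_custom=True and feed path_table/path_code to the forward call.
    loss_fn = paddle.nn.HSigmoidLoss(feature_size=16, num_classes=6)
    loss = loss_fn(x, label)                           # shape [4, 1]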
- label_int32 = fluid.data('label_int32', [4, 1], 'int32') - self.assertRaises( - TypeError, fluid.layers.hsigmoid, x_fp32, label_int32, 2 - ) - class TestHSigmoidLossAPICustom(TestHSigmoidLossAPI): def set_attrs(self): diff --git a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py index 778fedfd4aa08..afb1170a9db4e 100644 --- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py @@ -15,9 +15,7 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid import paddle -from paddle.fluid import Program, program_guard def huber_loss_forward(val, delta): @@ -31,7 +29,6 @@ def huber_loss_forward(val, delta): class TestHuberLossOp(OpTest): def setUp(self): self.op_type = 'huber_loss' - self.python_api = paddle.fluid.layers.huber_loss self.python_out_sig = ["Out"] self.delta = 1.0 self.init_input() @@ -54,10 +51,10 @@ def set_shape(self): return (100, 1) def test_check_output(self): - self.check_output(check_eager=True) + self.check_output(check_eager=False) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', check_eager=True) + self.check_grad(['X', 'Y'], 'Out', check_eager=False) def test_check_grad_ingore_x(self): self.check_grad( @@ -85,29 +82,6 @@ def set_shape(self): return (6, 6, 1) -class TestHuberLossOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - # the input and label must be Variable - xw = np.random.random((6, 6)).astype("float32") - xr = fluid.data(name='xr', shape=[None, 6], dtype="float32") - lw = np.random.random((6, 6)).astype("float32") - lr = fluid.data(name='lr', shape=[None, 6], dtype="float32") - delta = 1.0 - self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta) - self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta) - - # the dtype of input and label must be float32 or float64 - xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32") - lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32") - self.assertRaises( - TypeError, fluid.layers.huber_loss, xw2, lr, delta - ) - self.assertRaises( - TypeError, fluid.layers.huber_loss, xr, lw2, delta - ) - - if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1f2f07a067e63..38b0d96571b60 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2984,7 +2984,6 @@ def setUp(self): "make_gaussian_random_batch_size_like", "make_kldiv_loss", "make_prelu", - "make_sampled_softmax_with_cross_entropy", "make_sampling_id", "make_uniform_random_batch_size_like", } @@ -3091,18 +3090,6 @@ def _get_data( append_batch_size=append_batch_size, ) - def make_sampled_softmax_with_cross_entropy(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - logits = self._get_data(name='Logits', shape=[256], dtype='float32') - label = self._get_data(name='Label', shape=[1], dtype='int64') - num_samples = 25 - output = layers.sampled_softmax_with_cross_entropy( - logits, label, num_samples - ) - return output - def make_fit_a_line(self): with program_guard( fluid.default_main_program(), @@ -3237,33 +3224,6 @@ def make_sigmoid_cross_entropy(self): x=dat, label=lbl, ignore_index=ignore_index ) - def make_hsigmoid(self): - 
self._force_to_use_cpu = True - with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): - x = self._get_data(name='x', shape=[2], dtype='float32') - y = self._get_data(name='y', shape=[2], dtype='int64') - return layers.hsigmoid(input=x, label=y, num_classes=2) - - # test hsigmod with custom tree structure - program2 = Program() - with program_guard(program2): - x2 = self._get_data(name='x2', shape=[4, 8], dtype='float32') - y2 = self._get_data(name='y2', shape=[4], dtype='int64') - path_table = self._get_data( - name='path_table', shape=[4, 6], dtype='int64' - ) - path_code = self._get_data( - name='path_code', shape=[4, 6], dtype='int64' - ) - return layers.hsigmoid( - input=x2, - label=y2, - num_classes=6, - path_table=path_table, - path_code=path_code, - is_custom=True, - ) - def make_pool2d(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() @@ -3597,31 +3557,6 @@ def make_argsort(self): return out return ids - def make_rank_loss(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - label = self._get_data( - name='label', - append_batch_size=False, - shape=[16, 1], - dtype="float32", - ) - left = self._get_data( - name='left', - append_batch_size=False, - shape=[16, 1], - dtype="float32", - ) - right = self._get_data( - name='right', - append_batch_size=False, - shape=[16, 1], - dtype="float32", - ) - out = layers.rank_loss(label, left, right, name="rank_loss") - return out - def make_shape(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() @@ -3691,14 +3626,6 @@ def make_cross_entropy(self): out = layers.cross_entropy(x, label, False, 4) return out - def make_bpr_loss(self): - self._force_to_use_cpu = True - with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): - x = self._get_data(name="x", shape=[30, 10], dtype="float32") - label = self._get_data(name="label", shape=[30, 1], dtype="int64") - out = layers.bpr_loss(x, label) - return out - def make_expand(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() @@ -4585,17 +4512,6 @@ def test_warpctc_with_padding(self): ) return output - def test_edit_distance(self): - with self.static_graph(): - predict = layers.data( - name='predict', shape=[-1, 1], dtype='int64', lod_level=1 - ) - label = layers.data( - name='label', shape=[-1, 1], dtype='int64', lod_level=1 - ) - evaluator = fluid.evaluator.EditDistance(predict, label) - return evaluator.metrics - def test_basic_gru(self): input_size = 128 hidden_size = 256 diff --git a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py index f68b995a1ff7a..a931296fa364d 100644 --- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py @@ -16,6 +16,7 @@ import numpy as np from op_test import OpTest from paddle import fluid +import paddle class TestMarginRankLossOp(OpTest): @@ -87,7 +88,9 @@ def check_identity(self, place): label = fluid.data("label", (self.batch_size, 1), "float32") x1 = fluid.data("x1", (self.batch_size, 1), "float32") x2 = fluid.data("x2", (self.batch_size, 1), "float32") - out = fluid.layers.margin_rank_loss(label, x1, x2, self.margin) + out = paddle.nn.functional.margin_ranking_loss( + x1, x2, label, self.margin, 'none' + ) exe = fluid.Executor(place) exe.run(start) diff --git a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py 
b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py index 1ace41d2d24b2..49820853aa115 100644 --- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py @@ -15,8 +15,6 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard class TestRankLossOp(OpTest): @@ -84,31 +82,5 @@ def set_shape(self): return (batch_size), (batch_size), (batch_size) -class TestRankLossOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - label = fluid.data(name="label", shape=[16, 1], dtype="float32") - left = fluid.data(name="left", shape=[16, 1], dtype="float32") - right = fluid.data(name="right", shape=[16, 1], dtype="float32") - - def test_label_Variable(): - label_data = np.random.rand(16, 1).astype("float32") - out = fluid.layers.rank_loss(label_data, left, right) - - self.assertRaises(TypeError, test_label_Variable) - - def test_left_Variable(): - left_data = np.random.rand(16, 1).astype("float32") - out = fluid.layers.rank_loss(label, left_data, right) - - self.assertRaises(TypeError, test_left_Variable) - - def test_right_Variable(): - right_data = np.random.rand(16, 1).astype("float32") - out = fluid.layers.rank_loss(label, left, right_data) - - self.assertRaises(TypeError, test_right_Variable) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py index 88ea8f647c4bf..307a4edcf185c 100644 --- a/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py @@ -17,8 +17,6 @@ from math import exp from op_test import OpTest from scipy.special import logit -import unittest -import paddle.fluid as fluid class TestTeacherStudentSigmoidLossOp(OpTest): @@ -71,20 +69,3 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(["X"], "Y", numeric_grad_delta=0.005) - - -class TestTeacherStudentSigmoidLossInvalidInput(unittest.TestCase): - def test_error(self): - def test_invalid_input(): - input = [512, 1] - label = fluid.data(name='label', shape=[None, 1], dtype='float32') - loss = fluid.layers.teacher_student_sigmoid_loss(input, label) - - self.assertRaises(TypeError, test_invalid_input) - - def test_invalid_label(): - input = fluid.data(name='input1', shape=[None, 1], dtype='float32') - label = [512, 1] - loss = fluid.layers.teacher_student_sigmoid_loss(input, label) - - self.assertRaises(TypeError, test_invalid_label) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 234224964b3f0..e93e52b31a5f9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -506,7 +506,6 @@ def edit_distance( input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. 
For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] - NOTE: This Api is different from fluid.metrics.EditDistance Returns: Tuple: From 78b30e976201c2b021b87c865528e05a511e0a43 Mon Sep 17 00:00:00 2001 From: Piotr Paturej <48731682+piotrekobi@users.noreply.github.com> Date: Tue, 22 Nov 2022 14:48:22 +0100 Subject: [PATCH 158/210] [PHI] Migrate elementwise_div + all elementwise grad kernels (#48210) * Migrate elementwise_div * Migrate elementwise grad kernels --- .../mkldnn/elementwise_add_mkldnn_op.cc | 8 - .../mkldnn/elementwise_div_mkldnn_op.cc | 32 -- .../mkldnn/elementwise_mul_mkldnn_op.cc | 8 - .../mkldnn/elementwise_sub_mkldnn_op.cc | 8 - paddle/phi/kernels/elementwise_kernel.cc | 5 + .../kernels/onednn/elementwise_grad_kernel.cc | 361 ++++++++++++++++++ .../phi/kernels/onednn/elementwise_kernel.cc | 139 +++++++ 7 files changed, 505 insertions(+), 56 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc create mode 100644 paddle/phi/kernels/onednn/elementwise_grad_kernel.cc create mode 100644 paddle/phi/kernels/onednn/elementwise_kernel.cc diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index ac0bf15aeb99a..57996477e38a9 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -25,11 +25,3 @@ REGISTER_OP_KERNEL( dnnl::algorithm::binary_add>, ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL( - elementwise_add_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::EltwiseMKLDNNGradKernel, - ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc deleted file mode 100644 index d527a078c658c..0000000000000 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(elementwise_div, - MKLDNN, - paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL( - elementwise_div_grad, - MKLDNN, - paddle::platform::CPUPlace, - ops::EltwiseMKLDNNGradKernel, - ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index afebc5c6e322a..ba3a0d87f6cf7 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -25,11 +25,3 @@ REGISTER_OP_KERNEL( dnnl::algorithm::binary_mul>, ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL( - elementwise_mul_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::EltwiseMKLDNNGradKernel, - ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index c0eb9b657dbf7..91660b79b09ac 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -25,11 +25,3 @@ REGISTER_OP_KERNEL( dnnl::algorithm::binary_sub>, ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL( - elementwise_sub_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::EltwiseMKLDNNGradKernel, - ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index c6031b34af249..c0b99b8ddf036 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -414,3 +414,8 @@ PD_REGISTER_KERNEL(elementwise_pow, float, phi::dtype::float16) {} #endif + +#if defined PADDLE_WITH_MKLDNN +PD_REGISTER_KERNEL( + divide, OneDNN, ONEDNN, phi::DivideKernel, float, phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc new file mode 100644 index 0000000000000..7c65c373dedb1 --- /dev/null +++ b/paddle/phi/kernels/onednn/elementwise_grad_kernel.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace funcs { + +inline std::vector CalculateBroadcastedDims( + const phi::DenseTensor* x, const phi::DenseTensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + std::vector dst_tz_ex(src_tz.size(), 1); + + if (src_tz.size() == dst_tz.size()) { + for (size_t i = 0; i < src_tz.size(); i++) { + dst_tz_ex[i] = (src_tz[i] == dst_tz[i]) ? dst_tz[i] : 1; + } + } else { + size_t j = 0; + for (size_t i = 0; i < src_tz.size(); i++) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + } + + return dst_tz_ex; +} + +inline void AddSubNonBroadcast(ReorderOneDNNHandler* reorder_handler, + phi::DenseTensor* grad_tensor, + const std::shared_ptr& src_memory, + const std::shared_ptr& dst_memory, + const std::vector& scales) { + dnnl::primitive_attr reorder_attr; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = + reorder_handler->AcquireReorder(dst_memory, src_memory, reorder_attr); + + paddle::platform::RecordEvent record_reorder( + "int_reorder", + paddle::platform::TracerEventType::UserDefined, + 2, + paddle::platform::EventRole::kUniqueOp); + + reorder_p->execute( + OneDNNContext::tls().get_stream(), *src_memory, *dst_memory); +} + +template +inline void BroadcastReduction(const Place& place, + const dnnl::engine& onednn_engine, + phi::DenseTensor* grad_tensor, + const phi::DenseTensor* dout, + const std::shared_ptr& src_memory, + std::shared_ptr dst_memory, + const std::vector& scales, + const bool is_sub) { + dnnl::primitive_attr broadcast_reduction_attr; + + // Broadcasting + if (is_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0); + broadcast_reduction_attr.set_post_ops(po); + } + + ReductionOneDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, + 0.0f, + 0.0f, + onednn_engine, + place, + dout, + grad_tensor, + CalculateBroadcastedDims(dout, grad_tensor), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(grad_tensor); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + auto astream = OneDNNContext::tls().get_stream(); + reduction_p->execute(astream, + { + {DNNL_ARG_SRC, *src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + grad_tensor->set_mem_desc(dst_memory->get_desc().reshape( + phi::vectorize(grad_tensor->dims()))); +} + +} // namespace funcs + +template +void ElementwiseGradKernel(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor* out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto& onednn_engine = dev_ctx.GetEngine(); + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + bool swap_x_y = false; + auto* non_const_x = &x; + auto* non_const_y = &y; + if (x.numel() < y.numel()) { + std::swap(non_const_x, non_const_y); + std::swap(dx, dy); + swap_x_y = true; + } + + std::vector scales{1.0}; + if (swap_x_y) { + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + } + + auto tz = phi::vectorize(dout.dims()); + + funcs::ReorderOneDNNHandler reorder_handler( + tz, dout.dtype(), funcs::ToOneDNNDataType(dout.dtype()), onednn_engine); + + auto reorder_src_memory = reorder_handler.AcquireSrcMemory( + dout.mem_desc(), funcs::to_void_cast(dout.data())); + + std::shared_ptr dst_memory; + std::shared_ptr broadcast_src_memory = reorder_src_memory; + + auto& astream = OneDNNContext::tls().get_stream(); + if (dx) { + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout.dims() == dx->dims()) { + dst_memory = reorder_handler.AcquireDstMemory( + dx, dout.mem_desc(), dev_ctx.GetPlace()); + AddSubNonBroadcast( + &reorder_handler, dx, reorder_src_memory, dst_memory, scales); + } + } else { // elementwise_mul & elementwise_div + funcs::BinaryOneDNNHandler binary_handler(BINARY_OP, + axis, + onednn_engine, + dev_ctx.GetPlace(), + &dout, + non_const_y, + dx, + 1.0f, + 1.0f, + 1.0f, + false); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(&dout); + const auto src_y_memory = + binary_handler.AcquireSecondSrcMemory(non_const_y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + if (dout.dims() != dx->dims()) { + funcs::BroadcastReduction(dev_ctx.GetPlace(), + onednn_engine, + dx, + &dout, + broadcast_src_memory, + dst_memory, + scales, + BINARY_OP == dnnl::algorithm::binary_sub); + } else { + dx->set_mem_desc(dst_memory->get_desc()); + } + } + + if (dy) { + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout.dims() == dy->dims()) { + dst_memory = reorder_handler.AcquireDstMemory( + dy, dout.mem_desc(), dev_ctx.GetPlace()); + AddSubNonBroadcast( + &reorder_handler, dy, reorder_src_memory, dst_memory, scales); + } + } else { // elementwise_mul & elementwise_div + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + funcs::BinaryOneDNNHandler binary_handler(dnnl::algorithm::binary_mul, + axis, + onednn_engine, + dev_ctx.GetPlace(), + &dout, + non_const_x, + nullptr, + 1.0f, + 1.0f, + 1.0f, + false); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(non_const_x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + funcs::BinaryOneDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, + axis, + onednn_engine, + dev_ctx.GetPlace(), + non_const_y, + non_const_y, + nullptr, + 1.0f, + 1.0f, + 1.0f, + false); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(non_const_y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = + funcs::BinaryOneDNNHandler(dnnl::algorithm::binary_mul, + axis, + onednn_engine, + dev_ctx.GetPlace(), + &dout, + out, + nullptr, + -1.0f, + 1.0f, + 1.0f, + false, + po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(&dout); + + const auto dst_dy_memory = (dout.dims() == dy->dims()) + ? 
binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + + if (dout.dims() != dy->dims()) { + funcs::BroadcastReduction(dev_ctx.GetPlace(), + onednn_engine, + dy, + &dout, + broadcast_src_memory, + dst_memory, + scales, + BINARY_OP == dnnl::algorithm::binary_sub); + } else { + dy->set_mem_desc(dst_memory->get_desc()); + } + } +} + +#define DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(name, algorithm) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + const DenseTensor& dout, \ + int axis, \ + DenseTensor* dx, \ + DenseTensor* dy) { \ + ElementwiseGradKernel( \ + dev_ctx, x, y, nullptr, dout, axis, dx, dy); \ + } + +DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Add, dnnl::algorithm::binary_add) +DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Subtract, dnnl::algorithm::binary_sub) +DEFINE_ONEDNN_ELEMENTWISE_GRAD_KERNEL(Multiply, dnnl::algorithm::binary_mul) + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + ElementwiseGradKernel( + dev_ctx, x, y, &out, dout, axis, dx, dy); +} +} // namespace phi + +PD_REGISTER_KERNEL( + add_grad, OneDNN, ONEDNN, phi::AddGradKernel, float, phi::dtype::bfloat16) { +} + +PD_REGISTER_KERNEL(subtract_grad, + OneDNN, + ONEDNN, + phi::SubtractGradKernel, + float, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(multiply_grad, + OneDNN, + ONEDNN, + phi::MultiplyGradKernel, + float, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(divide_grad, + OneDNN, + ONEDNN, + phi::DivideGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc new file mode 100644 index 0000000000000..51be7559772d7 --- /dev/null +++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ElementwiseKernel(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + float scale_x = dev_ctx.HasDnnAttr("Scale_x") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) + : 1; + float scale_y = dev_ctx.HasDnnAttr("Scale_y") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) + : 1; + float scale_out = + dev_ctx.HasDnnAttr("Scale_out") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) + : 1; + + dnnl::post_ops post_operations; + funcs::AppendActivation(dev_ctx, post_operations); + + auto* non_const_x = &x; + auto* non_const_y = &y; + + funcs::BinaryOneDNNHandler handler(BINARY_OP, + axis, + onednn_engine, + dev_ctx.GetPlace(), + non_const_x, + non_const_y, + out, + scale_x, + scale_y, + scale_out, + true, + post_operations); + + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x.numel() < y.numel()) { + std::swap(non_const_x, non_const_y); + } + + const auto src_x_memory = handler.AcquireSrcMemory(non_const_x); + const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y); + // (jczaja) For Inplace src and dst should be the same memory object. + // So x should share buffer with z. But UT mechanics is testing inplace + // execution for this op not checking that x can be bradcasted to match in + // shape y tensor. + // This is wrong as when x is to be broadcasted then z(out) will match the + // shape of y which is bigger than x. Hence if x is smaller in shape than z + // and they share a buffer (of + // shape x) then this buffer is not big enough to hold result of elementwise + // operation. + const bool reuse_x_memory = non_const_x->numel() == out->numel() && + non_const_x->IsSharedBufferWith(*out); + std::shared_ptr dst_memory; + + if (reuse_x_memory) { + dst_memory = src_x_memory; + // NOTE(chenfeiyu): when the output reuses memory from other tensor rather + // than allocate its own, it's still need to take care of its data type. + // Unfortunately, paddle's operator only infers the output' shape, but not + // the data type. Alloc takes care of allocation and data type + // normally, but if the memory is already allocated and there is no need + // to re-allocate, it just set the data type. So this it added there to + // get the right data type. 
+ dev_ctx.template Alloc(out); + } else { + dst_memory = handler.AcquireDstMemory(out); + } + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_x_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + if (handler.use_broadcasting_hack == false) { + out->set_mem_desc(dst_memory->get_desc()); + } else { + auto dims = dst_memory->get_desc().dims(); + dims.insert(dims.begin(), non_const_x->dims()[0]); + dims[1] /= dims[0]; + out->set_mem_desc(dst_memory->get_desc().reshape(dims)); + } +} + +#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + ElementwiseKernel(dev_ctx, x, y, axis, out); \ + } + +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(Divide, dnnl::algorithm::binary_div) + +} // namespace phi + +PD_REGISTER_KERNEL(divide_raw, + OneDNN, + ONEDNN, + phi::DivideRawKernel, + float, + phi::dtype::bfloat16) {} From b07e6b452555b4a50cb700202150bdbb3566af49 Mon Sep 17 00:00:00 2001 From: ZZK <359521840@qq.com> Date: Wed, 23 Nov 2022 09:32:05 +0800 Subject: [PATCH 159/210] Use cublaslt in multi transformer FFN (#48052) * use fused mlp in multi transformer * Restruct code * use cublaslt to fuse ffn * fix conflict --- .../fused/fused_multi_transformer_op.cu | 513 ++++++++++++++++++ .../fused/fused_multi_transformer_op.cu.h | 253 ++++++++- 2 files changed, 763 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index f52bc2a7f54d1..f56baef1d2672 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -14,6 +14,517 @@ limitations under the License. */ namespace paddle { namespace operators { +#if CUDA_VERSION >= 11060 // Use cublasLt to fuse FFN operation. + +template +class FusedMultiTransformerOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto &dev_ctx = ctx.cuda_device_context(); + + auto *time_step = ctx.Input("TimeStep"); + // 0. input + auto *input_x = ctx.Input("X"); + const auto input_x_dims = input_x->dims(); + int bsz = input_x_dims[0]; + int seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + int bsz_seq = bsz * seq_len; + + // 1. layer norm + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + auto ln_scales = ctx.MultiInput("LnScale"); + auto ln_biases = ctx.MultiInput("LnBias"); + + auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); + Tensor ln_mean, ln_var; + ln_mean.Resize({{bsz_seq}}); + auto *ln_mean_data = + dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); + ln_var.Resize({{bsz_seq}}); + auto *ln_var_data = dev_ctx.Alloc(&ln_var, ln_var.numel() * sizeof(U)); + + // 2. qkv + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto qkv_weights = ctx.MultiInput("QKVW"); + auto qkv_biases = ctx.MultiInput("QKVBias"); + const bool trans_qkvw = ctx.Attr("trans_qkvw"); + const auto qkv_w_dims = qkv_weights[0]->dims(); + int num_head = trans_qkvw ? 
qkv_w_dims[1] : qkv_w_dims[2]; + int dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; + // (transA, transB, compute_bias) = (false, trans_qkvw, false) + + // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we set + // compute_bias as false. + auto qkv_compute = AttnMatMul(dev_ctx, + false, + trans_qkvw, + bsz_seq, + output_size, + input_size, + /*compute_bias=*/false); + + Tensor qkv_out; + qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); + auto *qkv_out_data = + dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); + + // 3. fmha + AttnDropoutParam attn_param( + true, "upscale_in_train", 0.0, true, true, 0, nullptr); + auto fmha_compute = + FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); + auto *src_mask = ctx.Input("SrcMask"); + auto cache_kvs = ctx.MultiInput("CacheKV"); + auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); + // auto *time_step = ctx.Input("TimeStep"); + auto pre_caches = ctx.MultiInput("PreCaches"); + int cache_offset = 0; + if (pre_caches.size() > 0) { + cache_offset = pre_caches[0]->dims()[3]; + } + + auto out_seq_len = seq_len; + if (time_step) { + PADDLE_ENFORCE_EQ(time_step->place(), + platform::CPUPlace(), + platform::errors::PreconditionNotMet( + "The place of input(TimeStep) must be CPUPlace.")); + // cache_seq_len + int time_step_value = time_step->data()[0]; + PADDLE_ENFORCE_GT(time_step_value, + 0, + platform::errors::PreconditionNotMet( + "The value of time_step must > 0, but now is %d", + time_step_value)); + PADDLE_ENFORCE_EQ( + seq_len, + 1, + platform::errors::PreconditionNotMet( + "In decode stage, the seq_len of input must be 1, but now is %d", + seq_len)); + out_seq_len += time_step_value; + } else { + out_seq_len += cache_offset; + } + + Tensor q_transpose_out, kv_transpose_out, qk_out; + q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); + auto *q_transpose_out_data = + dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); + + kv_transpose_out.Resize({{2, bsz, num_head, seq_len, dim_head}}); + auto *kv_transpose_out_data = dev_ctx.Alloc( + &kv_transpose_out, kv_transpose_out.numel() * sizeof(T)); + + qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); + + Tensor src_mask_out; + if (cache_offset > 0) { + src_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *src_mask_out_data = + dev_ctx.Alloc(&src_mask_out, src_mask_out.numel() * sizeof(T)); + } + + // [2, bs, num_head, cache_seq_len + seq_len, head_dim] + Tensor pre_cache_kv_out; + if (cache_offset > 0) { + pre_cache_kv_out.Resize( + {{2, bsz, num_head, seq_len + cache_offset, dim_head}}); + auto *pre_cache_kv_out_data = dev_ctx.Alloc( + &pre_cache_kv_out, pre_cache_kv_out.numel() * sizeof(T)); + } + + Tensor softmax_out; + Tensor attn_dropout_mask_out, attn_dropout_out; + Tensor qktv_out, fmha_out; + softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *softmax_out_data = + dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); + + attn_dropout_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_mask_out_data = dev_ctx.Alloc( + &attn_dropout_mask_out, attn_dropout_mask_out.numel() * sizeof(T)); + attn_dropout_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_data_data = dev_ctx.Alloc( 
+ &attn_dropout_out, attn_dropout_out.numel() * sizeof(T)); + + qktv_out.Resize({{bsz, num_head, seq_len, dim_head}}); + auto *qktv_out_data = + dev_ctx.Alloc(&qktv_out, qktv_out.numel() * sizeof(T)); + fmha_out.Resize({{bsz, seq_len, num_head, dim_head}}); + auto *fmha_out_data = + dev_ctx.Alloc(&fmha_out, fmha_out.numel() * sizeof(T)); + + // 4. out_linear + auto out_linear_weights = ctx.MultiInput("OutLinearW"); + auto out_linear_biases = ctx.MultiInput("OutLinearBias"); + int ring_id = ctx.Attr("ring_id"); + // (transA, transB, compute_bias) = (false, false, false) + auto out_linear_compute = AttnMatMul( + dev_ctx, false, false, bsz_seq, dim_embed, hidden_size, false); + + // 5. ln(residual + bias) + DropoutParam dropout_param2(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); + auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); + auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); + Tensor bias_dropout_residual_out, dropout_mask_out; + T *bias_dropout_residual_out_data = nullptr; + if (pre_layer_norm) { + bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); + bias_dropout_residual_out_data = + dev_ctx.Alloc(&bias_dropout_residual_out, + bias_dropout_residual_out.numel() * sizeof(T)); + } + dropout_mask_out.Resize({{bsz, seq_len, dim_embed}}); + auto *dropout_mask_out_data = dev_ctx.Alloc( + &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t)); + + // 6. ffn1 matmul + bias_add + gelu. + auto ffn1_weights = ctx.MultiInput("FFN1Weight"); + auto ffn1_biases = ctx.MultiInput("FFN1Bias"); + auto ffn1_weight_dim = ffn1_weights[0]->dims(); + + int dim_ffn = ffn1_weight_dim[1]; + + Tensor ffn1_out; + ffn1_out.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_out_data = + dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); + + auto ffn1_linear_bias_gelu = CublasFusedMLP(dev_ctx); + const phi::DDim ffn1_input_shape({bsz_seq, dim_ffn}); + ffn1_linear_bias_gelu.Setup( + ffn1_input_shape, ffn1_weight_dim, false, false); + + // 8. ffn2 matmul + bias_add + residual. + auto ffn2_weights = ctx.MultiInput("FFN2Weight"); + auto ffn2_biases = ctx.MultiInput("FFN2Bias"); + + auto ffn2_linear_bias_residual = CublasFusedMLP(dev_ctx); + ffn2_linear_bias_residual.Setup( + ffn1_out.dims(), ffn2_weights[0]->dims(), false, false); + + // 9. ffn2 residual bias + DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper ffn2_fused_dropout_helper( + dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); + + // calc + auto *out = ctx.Output("Out"); + auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); + Tensor *from_tensor = out; + Tensor tmp_out; + tmp_out.Resize({{bsz, seq_len, dim_embed}}); + auto *tmp_out_data = + dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); + + auto *x_data = input_x->data(); + Tensor *buf0 = nullptr; + Tensor *buf1 = nullptr; + + // step0: x --> buf1 + // step1: buf1 --> buf0 + // step2: buf0 --> buf1 + int layers = qkv_weights.size(); + if (pre_layer_norm) { + if (layers & 1) { + // odd, set buf1 as out + buf0 = &tmp_out; + buf1 = out; + } else { + // even, set buf0 as out + buf0 = out; + buf1 = &tmp_out; + } + } else { + buf0 = &tmp_out; + buf1 = out; + } + + for (int i = 0; i < layers; ++i) { + // step1. 
layer_norm + if (i == 0 && pre_layer_norm) { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + // TODO(wangxi): can remove mean var in inference + ln_compute.ComputeForward(x_data, + ln_scale_data, + ln_bias_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step1"; +#endif + + // step2. qkv + const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + // NOTE: in decoder stage, bias is fused in fmha + const Tensor *bias = time_step ? nullptr : qkv_bias; + if (!pre_layer_norm && i == 0) { + qkv_compute.ComputeForward( + qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); + } else { + qkv_compute.ComputeForward( + qkv_weights[i], buf1, bias, &qkv_out, &qkv_out); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step2"; +#endif + + // step3. fmha + const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + + if (time_step) { // generation decoder stage + // [2, batch_size, num_head, max_seq_len, head_size] + int max_seq_len = cache_kv->dims()[3]; + fmha(dev_ctx, + qkv_out, + *qkv_bias, + *src_mask, + cache_kv_out, + &fmha_out, + bsz, + max_seq_len, + num_head, + dim_head, + time_step->data()[0], + 1. / sqrt(dim_head)); + } else if (cache_kv_out) { // generation context stage + const Tensor *pre_cache_kv_tensor = + pre_caches.size() > 0 ? pre_caches[i] : nullptr; + Tensor *pre_cache_kv_out_tmp = + cache_offset > 0 ? &pre_cache_kv_out : nullptr; + Tensor *src_mask_tmp = cache_offset > 0 ? &src_mask_out : nullptr; + qkv_bias_add_transpose_split(dev_ctx, + q_transpose_out_data, + kv_transpose_out_data, + qkv_out_data, + qkv_bias->data(), + bsz, + num_head, + seq_len, + dim_head, + compute_bias); + fmha_compute.ComputeForwardWithoutTranspose(qkv_out, + pre_cache_kv_tensor, + src_mask, + &q_transpose_out, + &kv_transpose_out, + pre_cache_kv_out_tmp, + &qk_out, + src_mask_tmp, + &softmax_out, + &attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); + + const T *k_ptr = nullptr; + const T *v_ptr = nullptr; + + if (cache_offset > 0) { + // [2, bsz, num_head, cache_offset + seq_len, head_dim] + const T *kv_data = pre_cache_kv_out.data(); + k_ptr = kv_data; + int64_t k_size = bsz * num_head * (seq_len + cache_offset) * dim_head; + v_ptr = k_ptr + k_size; + } else { + // [3, bsz, num_head, seq_len, head_dim] + int64_t k_size = bsz * seq_len * num_head * dim_head; + const T *q_ptr = q_transpose_out_data; + k_ptr = kv_transpose_out_data; + v_ptr = k_ptr + k_size; + } + + // [2, bsz, num_head, max_seq_len, head_dim] + int max_seq_len = cache_kv_out->dims()[3]; + T *cache_kv_data = cache_kv_out->data(); + int64_t cache_k_size = bsz * num_head * max_seq_len * dim_head; + + T *cache_k_ptr = cache_kv_data; + T *cache_v_ptr = cache_kv_data + cache_k_size; + + const int seq_len_tmp = seq_len + cache_offset; + write_cache_kv(dev_ctx, + cache_k_ptr, + cache_v_ptr, + k_ptr, + v_ptr, + bsz, + num_head, + seq_len_tmp, + max_seq_len, + dim_head); + } else { // not generation + // TODO(wangxi): can remove dropout in inference + qkv_bias_add_transpose_split(dev_ctx, + q_transpose_out_data, + kv_transpose_out_data, + qkv_out_data, + qkv_bias->data(), + bsz, + num_head, + seq_len, + dim_head, + compute_bias); + fmha_compute.ComputeForwardWithoutTranspose(qkv_out, + cache_kv, + src_mask, + &q_transpose_out, + &kv_transpose_out, + cache_kv_out, + &qk_out, + nullptr, + &softmax_out, + 
&attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step3"; +#endif + + if (pre_layer_norm) { + out_linear_compute.ComputeForward( + out_linear_weights[i], &fmha_out, nullptr, buf1, nullptr); + AllReduce(*buf1, ring_id, buf1->numel(), dev_ctx); + } else { + out_linear_compute.ComputeForward( + out_linear_weights[i], &fmha_out, nullptr, buf0, nullptr); + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step4"; +#endif + + // step5. ln(residual + dropout(input + bias)) + if (pre_layer_norm) { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + + // inplace + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, + buf1->data(), + x_data, + out_linear_bias_data, + ln_scale_data, + ln_bias_data, + bias_dropout_residual_out_data, + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } else { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + auto *residual_data = (i == 0 ? x_data : buf1->data()); + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, + buf0->data(), + residual_data, + out_linear_bias_data, + ln_scale_data, + ln_bias_data, + buf0->data(), + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step5"; +#endif + // step6. ffn1 matmul + bias_add + gelu. + + ffn1_linear_bias_gelu.ComputeForward( + buf1, ffn1_weights[i], ffn1_biases[i], nullptr, &ffn1_out, "gelu"); + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step6"; +#endif + + // step7. ffn2 matmul + bias_add + residual. + if (pre_layer_norm) { + ffn2_linear_bias_residual.ComputeForward(&ffn1_out, + ffn2_weights[i], + ffn2_biases[i], + &bias_dropout_residual_out, + buf1, + "none"); + + } else { + ffn2_linear_bias_residual.ComputeForward( + &ffn1_out, ffn2_weights[i], ffn2_biases[i], buf1, buf0, "none"); + } + + if (pre_layer_norm) { + AllReduce(*buf1, ring_id, buf1->numel(), dev_ctx); + } else { + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step7"; +#endif + + // step8. layer norm or do nothing(because bias_add + residual has been + // fused into cublasFusedMLP. 
) + if (pre_layer_norm) { + if (i < layers - 1) { + auto *ln_scale_data = ln_scales[i + 1]->data(); + auto *ln_bias_data = ln_biases[i + 1]->data(); + ffn2_fused_dropout_helper.LayerNorm(dev_ctx, + buf1->data(), + ln_scale_data, + ln_bias_data, + buf0->data(), + ln_mean_data, + ln_var_data); + } + } else { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + ffn2_fused_dropout_helper.LayerNorm(dev_ctx, + buf0->data(), + ln_scale_data, + ln_bias_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step8"; +#endif + if (pre_layer_norm) { + x_data = buf1->data(); + std::swap(buf0, buf1); + } + } + } +}; + +#else + template class FusedMultiTransformerOpKernel : public framework::OpKernel { public: @@ -550,6 +1061,8 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } }; +#endif // CUDA_VERSION >= 11060 + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 79fc561698989..c36ee69723e45 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -26,7 +26,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/operators/fused/fused_gemm_epilogue_op.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -37,6 +39,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +DECLARE_bool(gemm_use_half_precision_compute_type); + namespace paddle { namespace operators { @@ -1336,10 +1340,10 @@ inline cudaError_t GetNumBlocks(int64_t n, int *num_blocks) { constexpr int kBlockSize = 128; constexpr int kNumWaves = 16; - const int device_id = paddle::platform::GetCurrentDeviceId(); - const int sm_count = paddle::platform::GetGPUMultiProcessors(device_id); + const int device_id = phi::backends::gpu::GetCurrentDeviceId(); + const int sm_count = phi::backends::gpu::GetGPUMultiProcessors(device_id); const int max_thread_per_multiprocessor = - paddle::platform::GetGPUMultiProcessors(device_id); + phi::backends::gpu::GetGPUMultiProcessors(device_id); *num_blocks = std::max(1, @@ -1400,6 +1404,249 @@ void qkv_bias_add_transpose_split(const phi::GPUContext &dev_ctx, } } +#if CUDA_VERSION >= 11060 +// Only Used in Inference +template +class CublasFusedMLP { + public: + // (m, n, k) = bsz_seq, hidden_feature, in_feature + explicit CublasFusedMLP(const phi::GPUContext &dev_ctx) : dev_ctx_(dev_ctx) { + // Set Math Type + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + + if (std::is_same::value) { + mat_type = CUDA_R_16F; + if (FLAGS_gemm_use_half_precision_compute_type) { + compute_type = CUBLAS_COMPUTE_16F; + scale_type = CUDA_R_16F; + } + } + if (std::is_same::value) { + mat_type = CUDA_R_16BF; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + // Just for init. 
+ PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc_, compute_type, scale_type)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc_, mat_type, 1, 1, 1)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &w_desc_, mat_type, 1, 1, 1)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc_, mat_type, 1, 1, 1)); + } + + ~CublasFusedMLP() { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescDestroy(operation_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutDestroy(x_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutDestroy(w_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutDestroy(out_desc_)); + } + + // Change to use tensor's shape. + void Setup(const phi::DDim &x_shape, + const phi::DDim &w_shape, + bool trans_x, + bool trans_w) { + int64_t M = trans_x ? x_shape[1] : x_shape[0]; + int64_t K = trans_w ? w_shape[1] : w_shape[0]; + int64_t N = trans_w ? w_shape[0] : w_shape[1]; + + cublasOperation_t cublas_transA = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cublas_transB = trans_w ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc_, + CUBLASLT_MATMUL_DESC_TRANSB, + &cublas_transA, + sizeof(cublas_transA))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc_, + CUBLASLT_MATMUL_DESC_TRANSA, + &cublas_transB, + sizeof(cublas_transB))); + + /* + cublas use col major: x(M, K) matmul w(K, N) = out(M, N) equals to w_t(N, K) + * x_t(K, M) = out(N, M) + */ + SetCublasMatrixLayout_(x_desc_, cublas_transA, K, M); + SetCublasMatrixLayout_(w_desc_, cublas_transB, N, K); + SetCublasMatrixLayout_(out_desc_, CUBLAS_OP_N, N, M); + } + + void ComputeForward(const phi::DenseTensor *input, + const phi::DenseTensor *weight, + const phi::DenseTensor *bias, + phi::DenseTensor *residual, + phi::DenseTensor *output, + const std::string &activation) { + // here: (transa, transb): nt, input * weight. + // (M * K) * (K * N) + cublasLtHandle_t lt_handle = dev_ctx_.cublaslt_handle(); + size_t workspace_size = static_cast(16) * 1024 * 1024; + cudaStream_t stream = dev_ctx_.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx_.GetPlace(), + workspace_size, + phi::Stream(reinterpret_cast(stream))); + + const bool add_residual = (residual == nullptr) ? false : true; + const bool add_bias = (bias == nullptr) ? false : true; + if (add_bias) { + SetCublasBiasPtr_(bias); + } + + // Set cublasLt epilogue. + cublasLtEpilogue_t epiloque_func = GetEpilogueType_(activation, add_bias); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc_, + CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func, + sizeof(epiloque_func))); + + const auto *x_data = input->data(); + const auto *w_data = weight->data(); + auto *residual_data = + add_residual ? residual->data() : output->data(); + auto *out_data = output->data(); + + // if add_residual, we compute result + 1.0 * residual, else result + 0.0 * + // out. + double alpha64 = 1.0, beta64 = add_residual ? 1.0 : 0.0; + float alpha32 = 1.0f, beta32 = add_residual ? 
1.0f : 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + auto algo = GemmEpilogueAlgoCache::Instance().GetGemmAlgo(lt_handle, + operation_desc_, + w_desc_, + x_desc_, + out_desc_, + alpha, + beta, + w_data, + x_data, + out_data, + stream, + workspace->ptr(), + workspace_size); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmul(lt_handle, + operation_desc_, + alpha, + w_data, + w_desc_, + x_data, + x_desc_, + beta, + residual_data, + out_desc_, + out_data, + out_desc_, + algo /*algo*/, + workspace->ptr() /*workspace*/, + workspace_size, + stream)); + } + + private: + static cublasLtEpilogue_t GetEpilogueType_(const std::string &activation, + const bool add_bias) { + if (activation == "relu") { + if (add_bias) { + return CUBLASLT_EPILOGUE_RELU_BIAS; + } else { + return CUBLASLT_EPILOGUE_RELU; + } + } else if (activation == "gelu") { + if (add_bias) { + return CUBLASLT_EPILOGUE_GELU_BIAS; + } else { + return CUBLASLT_EPILOGUE_GELU; + } + } else if (activation == "none") { + if (add_bias) { + return CUBLASLT_EPILOGUE_BIAS; + } else { + return CUBLASLT_EPILOGUE_DEFAULT; + } + } else { + PADDLE_ENFORCE_EQ( + true, + false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + } + + void SetCublasMatrixLayout_(cublasLtMatrixLayout_t layout_desc, + cublasOperation_t cublas_trans, + const size_t cublas_m, + const size_t cublas_n) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutSetAttribute( + layout_desc, + CUBLASLT_MATRIX_LAYOUT_ROWS, + cublas_trans == CUBLAS_OP_N ? &cublas_m : &cublas_n, + sizeof(cublas_m))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutSetAttribute( + layout_desc, + CUBLASLT_MATRIX_LAYOUT_COLS, + cublas_trans == CUBLAS_OP_N ? &cublas_n : &cublas_m, + sizeof(cublas_m))); + const size_t cublas_ld = cublas_trans == CUBLAS_OP_N ? 
cublas_m : cublas_n; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatrixLayoutSetAttribute( + layout_desc, + CUBLASLT_MATRIX_LAYOUT_LD, + &cublas_ld, + sizeof(cublas_ld))); + } + + void SetCublasBiasPtr_(const phi::DenseTensor *bias) { + const T *bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc_, + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias_data, + sizeof(bias_data))); + } + + const phi::GPUContext &dev_ctx_; + cublasLtMatmulDesc_t operation_desc_; + cublasLtMatrixLayout_t x_desc_; + cublasLtMatrixLayout_t w_desc_; + cublasLtMatrixLayout_t out_desc_; +}; + +#endif // PADDLE_FLUID_OPERATORS_FUSED_FUSED_MULTI_TRANSFORMER_OP_CU_H_ + } // namespace } // namespace operators From 25ffe9c29808fbb243c41b2115ceffa299d3bdd4 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:18:57 +0800 Subject: [PATCH 160/210] add warpctc kernel and change cast_v2 to cast for xpu, test=kunlun (#48134) --- cmake/external/xpu.cmake | 2 +- paddle/fluid/framework/data_type_transform.cc | 4 +- .../tests/test_gradient_accmulator.cc | 14 +- .../operators/metrics/accuracy_op_xpu.cc | 8 +- paddle/fluid/operators/top_k_op_xpu.cc | 10 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 3 + paddle/phi/kernels/xpu/amp_kernel.cc | 20 +- paddle/phi/kernels/xpu/cast_kernel.cc | 16 +- .../kernels/xpu/cross_entropy_grad_kernel.cc | 20 +- .../phi/kernels/xpu/cross_entropy_kernel.cc | 10 +- paddle/phi/kernels/xpu/gather_grad_kernel.cc | 22 +- paddle/phi/kernels/xpu/sgd_kernel.cc | 11 +- paddle/phi/kernels/xpu/top_k_kernel.cc | 20 +- paddle/phi/kernels/xpu/warpctc_grad_kernel.cc | 60 ++ paddle/phi/kernels/xpu/warpctc_kernel.cc | 102 ++++ .../unittests/xpu/test_warpctc_op_xpu.py | 563 ++++++++++++++++++ 16 files changed, 804 insertions(+), 81 deletions(-) create mode 100644 paddle/phi/kernels/xpu/warpctc_grad_kernel.cc create mode 100644 paddle/phi/kernels/xpu/warpctc_kernel.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 3b09f92081e4e..8d485fba6a3bd 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221116") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221120") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 3ab653841545c..0768d2d82fb81 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -40,12 +40,12 @@ static void XPUCastData(const phi::DenseTensor& in, const platform::XPUDeviceContext* dev_ctx) { using XPUInTDType = typename XPUTypeTrait::Type; using XPUOutTDType = typename XPUTypeTrait::Type; - int r = xpu::cast_v2( + int r = xpu::cast( dev_ctx->x_context(), reinterpret_cast(in.data()), reinterpret_cast(out->mutable_data(in.place())), in.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); dev_ctx->Wait(); } diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index 5d5aabbaf5f4f..6768a0ef35a63 100644 --- 
a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -161,13 +161,10 @@ TEST(test_add_functor, add_functor) { static_cast(1.0), static_cast(2.0)); EXPECT_EQ(cpu_res, 0); - -#ifndef PADDLE_WITH_XPU - // does not support double when compiled using xpu + // double cpu_res = TensorddTest( cpu_place, cpu_place, static_cast(1.0), static_cast(2.0)); EXPECT_EQ(cpu_res, 0); -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int gpu_res = 1; @@ -217,6 +214,9 @@ TEST(test_add_functor, add_functor) { static_cast(1.0), static_cast(2.0)); EXPECT_EQ(xpu_res, 0); + xpu_res = TensorddTest( + xpu_place, xpu_place, static_cast(1.0), static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); // different places xpu_res = TensorddTest( cpu_place, xpu_place, static_cast(1.0), static_cast(2.0)); @@ -234,6 +234,12 @@ TEST(test_add_functor, add_functor) { static_cast(1.0), static_cast(2.0)); EXPECT_EQ(xpu_res, 0); + xpu_res = TensorddTest( + cpu_place, xpu_place, static_cast(1.0), static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); + xpu_res = TensorddTest( + xpu_place, cpu_place, static_cast(1.0), static_cast(2.0)); + EXPECT_EQ(xpu_res, 0); #endif } diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index f3f39a40fbaea..4c83071264a42 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -50,13 +50,13 @@ class AccuracyXPUKernel : public framework::OpKernel { int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr); - int r = xpu::cast_v2( + int r = xpu::cast( dev_ctx.x_context(), indices_data, indices_int32_ptr, size); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), label_data, label_int32_ptr, size); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::accuracy(dev_ctx.x_context(), indices_int32_ptr, diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index 46428a3596d56..25f3faa38a0c5 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -79,11 +79,11 @@ class TopkXPUKernel : public framework::OpKernel { PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk"); // cast to int64 as final result - r = xpu::cast_v2(dev_ctx.x_context(), - (const int32_t*)indices_int_data, - indices_data, - indices->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + r = xpu::cast(dev_ctx.x_context(), + (const int32_t*)indices_int_data, + indices_data, + indices->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); } }; diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 4db463f46ae0d..4862401f83a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -681,6 +681,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"warpctc_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"warpctc", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/paddle/phi/kernels/xpu/amp_kernel.cc 
b/paddle/phi/kernels/xpu/amp_kernel.cc index 07679e45482c1..18ebc26aa87d1 100644 --- a/paddle/phi/kernels/xpu/amp_kernel.cc +++ b/paddle/phi/kernels/xpu/amp_kernel.cc @@ -233,11 +233,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, dev_ctx.template Alloc(&float_out, out->numel() * sizeof(MPDType)); - int r = xpu::cast_v2(dev_ctx.x_context(), - reinterpret_cast(x->data()), - float_x.data(), - x->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + int r = xpu::cast(dev_ctx.x_context(), + reinterpret_cast(x->data()), + float_x.data(), + x->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::scale(dev_ctx.x_context(), float_x.data(), @@ -248,11 +248,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, 0.0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); - r = xpu::cast_v2(dev_ctx.x_context(), - float_out.data(), - reinterpret_cast(out->data()), - out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + r = xpu::cast(dev_ctx.x_context(), + float_out.data(), + reinterpret_cast(out->data()), + out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); } else { int r = xpu::scale(dev_ctx.x_context(), reinterpret_cast(x->data()), diff --git a/paddle/phi/kernels/xpu/cast_kernel.cc b/paddle/phi/kernels/xpu/cast_kernel.cc index 346cf4cd3bfd6..86633eac09a5c 100644 --- a/paddle/phi/kernels/xpu/cast_kernel.cc +++ b/paddle/phi/kernels/xpu/cast_kernel.cc @@ -39,14 +39,14 @@ void CastKernel(const Context& dev_ctx, int r = -1; switch (out_dtype) { case phi::DataType::FLOAT32: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), numel); break; case phi::DataType::FLOAT16: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), reinterpret_cast( @@ -54,35 +54,35 @@ void CastKernel(const Context& dev_ctx, numel); break; case phi::DataType::INT64: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), numel); break; case phi::DataType::INT32: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), numel); break; case phi::DataType::BOOL: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), numel); break; case phi::DataType::UINT8: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), numel); break; case phi::DataType::FLOAT64: - r = xpu::cast_v2( + r = xpu::cast( dev_ctx.x_context(), reinterpret_cast(in_data), dev_ctx.template Alloc(out), @@ -93,7 +93,7 @@ void CastKernel(const Context& dev_ctx, "Not supported cast %d -> %d", x.dtype(), out_dtype)); } - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc index 042f41df9808e..edb7157a3440a 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_grad_kernel.cc @@ -59,11 +59,11 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, RAII_GUARD.alloc_l3_or_gm(labels.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - r = xpu::cast_v2(dev_ctx.x_context(), - labels.data(), - labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + 
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::hard_softmax_with_cross_entropy_grad( dev_ctx.x_context(), @@ -117,11 +117,11 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, RAII_GUARD.alloc_l3_or_gm(labels.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - r = xpu::cast_v2(dev_ctx.x_context(), - labels.data(), - labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::hard_softmax_with_cross_entropy_grad( dev_ctx.x_context(), reinterpret_cast(loss_grad.data()), diff --git a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc index cf58374f1c005..f054c6c445148 100644 --- a/paddle/phi/kernels/xpu/cross_entropy_kernel.cc +++ b/paddle/phi/kernels/xpu/cross_entropy_kernel.cc @@ -132,11 +132,11 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm(labels.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3); - r = xpu::cast_v2(dev_ctx.x_context(), - labels.data(), - labels_int_ptr_l3, - labels.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + r = xpu::cast(dev_ctx.x_context(), + labels.data(), + labels_int_ptr_l3, + labels.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::hard_cross_entropy( dev_ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/gather_grad_kernel.cc b/paddle/phi/kernels/xpu/gather_grad_kernel.cc index b1c1731130106..7be22a86d0019 100644 --- a/paddle/phi/kernels/xpu/gather_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/gather_grad_kernel.cc @@ -72,16 +72,11 @@ void GatherGradKernel(const Context& dev_ctx, } else { xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); int* index_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm(index.numel()); - r = xpu::cast_v2(dev_ctx.x_context(), - index.data(), - index_int_ptr_l3, - index.numel()); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - phi::errors::External("XPU API(cast_v2) return wrong " - "value[%d %s]", - r, - XPUAPIErrorMsg[r])); + r = xpu::cast(dev_ctx.x_context(), + index.data(), + index_int_ptr_l3, + index.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); r = xpu::gather_grad( dev_ctx.x_context(), @@ -93,12 +88,7 @@ void GatherGradKernel(const Context& dev_ctx, axis_v, overwrite); } - PADDLE_ENFORCE_EQ( - r, - xpu::Error_t::SUCCESS, - phi::errors::External("XPU gather grad kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather_grad"); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/sgd_kernel.cc b/paddle/phi/kernels/xpu/sgd_kernel.cc index 1bfd790893af0..510fddae3ba7c 100644 --- a/paddle/phi/kernels/xpu/sgd_kernel.cc +++ b/paddle/phi/kernels/xpu/sgd_kernel.cc @@ -54,12 +54,11 @@ void SGDDenseKernel(const Context &dev_ctx, const float *lr = nullptr; if (std::is_same::value) { float *lr_float = RAII_GUARD.alloc_l3_or_gm(learning_rate.numel()); - int r = - xpu::cast_v2(dev_ctx.x_context(), - reinterpret_cast(lr_t), - lr_float, - learning_rate.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + int r = xpu::cast(dev_ctx.x_context(), + reinterpret_cast(lr_t), + lr_float, + learning_rate.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); lr = lr_float; } else { lr = reinterpret_cast(lr_t); diff --git a/paddle/phi/kernels/xpu/top_k_kernel.cc b/paddle/phi/kernels/xpu/top_k_kernel.cc index 411b74928d0a2..f2592f9501ee1 100644 --- a/paddle/phi/kernels/xpu/top_k_kernel.cc +++ 
b/paddle/phi/kernels/xpu/top_k_kernel.cc @@ -68,11 +68,11 @@ void TopkKernel(const Context& dev_ctx, k); PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk"); - r = xpu::cast_v2(dev_ctx.x_context(), - (const int32_t*)indices_int_data, - indices_data, - indices->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + r = xpu::cast(dev_ctx.x_context(), + (const int32_t*)indices_int_data, + indices_data, + indices->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); } else { // do transpose if axis is not the last dim of input std::vector trans_axes; @@ -127,11 +127,11 @@ void TopkKernel(const Context& dev_ctx, k); PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk"); - r = xpu::cast_v2(dev_ctx.x_context(), - (const int32_t*)trans_idx_int32_data, - trans_idx_data, - indices->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + r = xpu::cast(dev_ctx.x_context(), + (const int32_t*)trans_idx_int32_data, + trans_idx_data, + indices->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); // Transpose back to original dims std::vector trans_back_axes; for (int i = 0; i < axis; i++) { diff --git a/paddle/phi/kernels/xpu/warpctc_grad_kernel.cc b/paddle/phi/kernels/xpu/warpctc_grad_kernel.cc new file mode 100644 index 0000000000000..330bc01d8b336 --- /dev/null +++ b/paddle/phi/kernels/xpu/warpctc_grad_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/warpctc_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WarpctcGradKernel(const Context& dev_ctx, + const DenseTensor& logits, + const paddle::optional& logits_length, + const DenseTensor& warpctcgrad, + const DenseTensor& loss_grad, + int blank, + bool norm_by_times, + DenseTensor* logits_grad) { + dev_ctx.template Alloc(logits_grad); + + bool has_logits_length = logits_length.is_initialized(); + if (!has_logits_length) { + PADDLE_THROW( + phi::errors::External("XPU only support logits_length is_initialized")); + } + + int max_seq_length = warpctcgrad.dims()[0]; // Tmax + int num_sequences = warpctcgrad.dims()[1]; // B + int seq_width = warpctcgrad.dims()[2]; // D + auto* logits_length_ptr = logits_length.get_ptr(); + + int r = xpu::ctc_loss_grad(dev_ctx.x_context(), + loss_grad.data(), + logits_grad->data(), + warpctcgrad.data(), + max_seq_length, + num_sequences, + seq_width, + logits_length_ptr->data(), + norm_by_times); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "ctc_loss_grad"); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + warpctc_grad, XPU, ALL_LAYOUT, phi::WarpctcGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/warpctc_kernel.cc b/paddle/phi/kernels/xpu/warpctc_kernel.cc new file mode 100644 index 0000000000000..833ff81daa208 --- /dev/null +++ b/paddle/phi/kernels/xpu/warpctc_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/warpctc_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WarpctcKernel(const Context& dev_ctx, + const DenseTensor& logits, + const DenseTensor& label, + const paddle::optional& logits_length, + const paddle::optional& labels_length, + int blank, + bool norm_by_times, + DenseTensor* loss, + DenseTensor* warpctcgrad) { + bool has_logits_length = logits_length.is_initialized(); + if (!has_logits_length) { + PADDLE_THROW( + phi::errors::External("XPU only support logits_length is_initialized")); + } + bool has_labels_length = labels_length.is_initialized(); + if (!has_labels_length) { + PADDLE_THROW( + phi::errors::External("XPU only support labels_length is_initialized")); + } + + int max_sequence_length = logits.dims()[0]; + int num_sequences = logits.dims()[1]; + int sequence_width = logits.dims()[2]; + int max_target_seq_length = label.dims()[1]; + + PADDLE_ENFORCE_GT(max_sequence_length, + 0, + phi::errors::InvalidArgument( + "The first dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + max_sequence_length)); + PADDLE_ENFORCE_GT(num_sequences, + 0, + phi::errors::InvalidArgument( + "The second dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + num_sequences)); + PADDLE_ENFORCE_GT(sequence_width, + 0, + phi::errors::InvalidArgument( + "The third dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + sequence_width)); + + loss->Resize(phi::make_ddim({num_sequences, 1})); + dev_ctx.template Alloc(loss); + + warpctcgrad->Resize( + phi::make_ddim({max_sequence_length, num_sequences, sequence_width})); + dev_ctx.template Alloc(warpctcgrad); + + const T* logits_data = logits.data(); + const int* label_data = label.data(); + auto logits_length_data = logits_length.get_ptr()->data(); + auto labels_length_data = labels_length.get_ptr()->data(); + T* loss_data = loss->data(); + T* warpctcgrad_data = warpctcgrad->data(); + + int r = xpu::ctc_loss(dev_ctx.x_context(), + logits_data, + label_data, + loss_data, + warpctcgrad_data, + logits_length_data, + labels_length_data, + max_sequence_length, + num_sequences, + sequence_width, + max_target_seq_length, + blank); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "ctc_loss"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(warpctc, XPU, ALL_LAYOUT, phi::WarpctcKernel, float) {} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py new file mode 100644 index 0000000000000..3dcefb0e1e91f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py @@ -0,0 +1,563 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +sys.path.append("..") +import unittest +import numpy as np +from test_softmax_op import stable_softmax +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle +import paddle.nn.functional as F +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import ( + create_test_class, + get_xpu_op_support_types, + XPUOpTestWrapper, +) + +paddle.enable_static() + +CUDA_BLOCK_SIZE = 32 + + +class CTCForward(object): + def __init__( + self, + softmax, + softmax_lod, + labels, + labels_lod, + num_classes, + batch_size, + blank, + norm_by_times, + ): + self.softmax = softmax + self.softmax_lod = softmax_lod + self.labels = labels + self.labels_lod = labels_lod + self.blank = blank + self.norm_by_times = norm_by_times + + self.level = 0 + self.num_classes = num_classes + self.batch_size = batch_size + + self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype) + self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype) + + # float64 + self.EXP_MAX = sys.float_info.max + self.EXP_MIN = sys.float_info.min + self.LOG_ZERO = np.log(self.EXP_MIN) + self.LOG_INFINITY = np.log(self.EXP_MAX) + + def safe_exp(self, x): + if x <= self.LOG_ZERO: + return 0.0 + if x >= self.LOG_INFINITY: + return self.EXP_MAX + return np.exp(x) + + def safe_log(self, x): + if x <= self.EXP_MIN: + return self.LOG_ZERO + return np.log(x) + + # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb + def log_div(self, x, y): + res = x - y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb + def log_mul(self, x, y): + res = x + y + if res <= self.LOG_ZERO: + return self.LOG_ZERO + if res >= self.LOG_INFINITY: + return self.LOG_INFINITY + return res + + # x = lna and y = lnb are in log scale, + # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a + def log_add(self, x, y): + if x < y: + t = y + y = x + x = t + return x + self.safe_log(1 + self.safe_exp(y - x)) + + def segment_range(self, time, total_times, total_segments): + start = max(0, total_segments - (2 * (total_times - time))) + end = min(total_segments, 2 * (time + 1)) + return start, end + + def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence): + total_times = softmax_a_sequence.shape[0] + total_segments = labels_a_sequence.shape[0] * 2 + 1 + + required_times = labels_a_sequence.shape[0] + old_label = -1 + for i in range(labels_a_sequence.shape[0]): + # two contingous labels with the same value + if labels_a_sequence[i, 0] == old_label: + required_times = required_times + 1 + old_label = labels_a_sequence[i, 0] + + if total_times < required_times: + return 0 + + # calculate the forward and backward variables, + # reference Chapter 7.3 of "Alex Grave, Supervised Sequence + # Labelling with Recurrent Neural Networks" + log_acts = np.zeros( + [total_times, self.num_classes], 
dtype=softmax_a_sequence.dtype + ) + for i in range(total_times): + for j in range(self.num_classes): + log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j]) + + # calculate the forward variables + forward_vars = np.zeros( + [total_times, total_segments], dtype=softmax_a_sequence.dtype + ) + for i in range(total_times): + for j in range(total_segments): + forward_vars[i, j] = self.LOG_ZERO + + for i in range(total_times): + # dp initialization at t0 + if i == 0: + forward_vars[i, 0] = log_acts[0, self.blank] + if total_segments > 1: + forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]] + continue + + # dp from t1 + start, end = self.segment_range(i, total_times, total_segments) + for k in range(end - start): + j = k + start + if j & 1 == 1: + label_idx = j // 2 + label_val = labels_a_sequence[label_idx, 0] + fv = self.log_add( + forward_vars[i - 1, j], forward_vars[i - 1, j - 1] + ) + if ( + j > 1 + and label_val != labels_a_sequence[label_idx - 1, 0] + ): + fv = self.log_add(fv, forward_vars[i - 1, j - 2]) + fv = self.log_mul(fv, log_acts[i, label_val]) + else: + fv = forward_vars[i - 1, j] + if j > 0: + fv = self.log_add(fv, forward_vars[i - 1, j - 1]) + fv = self.log_mul(fv, log_acts[i, self.blank]) + forward_vars[i, j] = fv + + # sum the last two value as log_prob + log_prob = forward_vars[total_times - 1, total_segments - 1] + if total_segments > 1: + log_prob = self.log_add( + log_prob, forward_vars[total_times - 1, total_segments - 2] + ) + + return -log_prob + + def forward(self): + softmax_offset = 0 + labels_offset = 0 + for i in range(self.batch_size): + if self.labels.shape[1] == 1: + softmax_start_i = softmax_offset + softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] + labels_start_i = labels_offset + labels_end_i = labels_offset + self.labels_lod[self.level][i] + + softmax_a_sequence = self.softmax[ + softmax_start_i:softmax_end_i, : + ] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence( + softmax_a_sequence, labels_a_sequence + ) + softmax_offset += self.softmax_lod[self.level][i] + labels_offset += self.labels_lod[self.level][i] + else: + softmax_a_sequence = self.softmax[: self.softmax_lod[i], i, :] + labels_a_sequence = self.labels[: self.labels_lod[i], :] + self.loss[i] = self.forward_a_sequence( + softmax_a_sequence, labels_a_sequence + ) + + return self.loss + + +def python_api( + logits, + label, + logits_length=None, + labels_length=None, + blank=0, + norm_by_times=False, +): + return paddle.fluid.layers.warpctc( + logits, label, blank, norm_by_times, logits_length, labels_length + ) + + +class XPUTestWarpCTCOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'warpctc' + + class TestWarpCTCOpWithPadding(XPUOpTest): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] + self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + def setUp(self): + self.op_type = "warpctc" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.python_api = python_api + self.python_out_sig = ["Loss"] + self.config() + + logits = np.random.uniform( + 0.1, 1.0, [sum(self.logits_length), self.num_classes] + ).astype(self.dtype) + print("logits.shape = ", logits.shape) + softmax = np.apply_along_axis(stable_softmax, 1, logits) + # labels should not be blank + labels = 
np.random.randint( + 0, + self.num_classes - 1, + [sum(self.labels_length), 1], + dtype="int32", + ) + + ctc = CTCForward( + softmax, + self.logits_lod, + labels, + self.labels_lod, + self.num_classes, + self.batch_size, + self.blank, + self.norm_by_times, + ) + loss = ctc.forward() + + max_sequence_length = 0 + for i in range(self.batch_size): + max_sequence_length = max( + max_sequence_length, self.logits_length[i] + ) + # reshape logits to T*N*S + new_logits = np.zeros( + [max_sequence_length, self.batch_size, self.num_classes], + dtype=logits.dtype, + ) + + cur = 0 + for batch_id in range(self.batch_size): + for i in range(self.logits_length[batch_id]): + for j in range(self.num_classes): + new_logits[i, batch_id, j] = logits[cur + i, j] + cur = cur + self.logits_length[batch_id] + + # reshape labels to N*S + max_target_seq_length = 0 + for i in range(self.batch_size): + max_target_seq_length = max( + max_target_seq_length, self.labels_length[i] + ) + new_labels = np.zeros( + [self.batch_size, max_target_seq_length], dtype="int32" + ) + + cur = 0 + for batch_id in range(self.batch_size): + for i in range(self.labels_length[batch_id]): + new_labels[batch_id, i] = labels[cur + i] + cur = cur + self.labels_length[batch_id] + + self.gradient = np.zeros( + [max_sequence_length, self.batch_size, self.num_classes], + dtype=logits.dtype, + ) + + self.inputs = { + "Logits": new_logits, + "Label": new_labels, + "LogitsLength": self.logits_length, + "LabelLength": self.labels_length, + } + self.outputs = {"Loss": loss} + self.attrs = { + "blank": self.blank, + "norm_by_times": self.norm_by_times, + } + + def test_check_output(self): + self.check_output(check_eager=True) + + def test_check_grad(self): + self.outputs['WarpCTCGrad'] = self.gradient + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, + ["Logits"], + "Loss", + max_relative_error=0.007, + check_dygraph=False, + ) + + class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding): + def config(self): + self.batch_size = 4 + self.num_classes = CUDA_BLOCK_SIZE + 2 + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] + self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + class TestWarpCTCOpError(unittest.TestCase): + def test_errors(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + with program_guard(Program(), Program()): + logits = fluid.data( + name='logits', shape=[5, 16, 6], dtype=self.dtype + ) + logits_length = fluid.data( + name='logits_length', shape=[None], dtype='int64' + ) + label = fluid.data(name='label', shape=[16, 3], dtype='int32') + label_length = fluid.data( + name='labels_length', shape=[None], dtype='int64' + ) + + def test_logits_Variable(): + logits_data = np.random.rand(5, 16, 6).astype(logits.dtype) + fluid.layers.warpctc( + input=logits_data, + label=label, + input_length=logits_length, + label_length=label_length, + ) + + self.assertRaises(TypeError, test_logits_Variable) + + def test_label_Variable(): + label_data = np.random.randint(0, 5, [5, 1]).astype("int32") + fluid.layers.warpctc( + input=logits, + label=label_data, + input_length=logits_length, + label_length=label_length, + ) + + self.assertRaises(TypeError, test_label_Variable) + + def test_logits_len_Variable(): + logits_length_data = np.array([5] * 16).astype("int64") + fluid.layers.warpctc( + input=logits, + label=label, + input_length=logits_length_data, + 
label_length=label_length, + ) + + self.assertRaises(TypeError, test_logits_len_Variable) + + def test_label_len_Variable(): + label_length_data = np.array([3] * 16).astype("int64") + fluid.layers.warpctc( + input=logits, + label=label, + input_length=logits_length, + label_length=label_length_data, + ) + + self.assertRaises(TypeError, test_label_len_Variable) + + def test_dygraph_errors(self): + def test_dygraph_with_lod(): + + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + + logits = np.random.uniform(0.1, 1.0, [20, 15]).astype( + self.dtype + ) + # labels should not be blank + labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32") + softmax = paddle.to_tensor(logits) + labels = paddle.to_tensor(labels) + + fluid.layers.warpctc(input=softmax, label=labels) + + paddle.disable_static() + self.assertRaises(ValueError, test_dygraph_with_lod) + paddle.enable_static() + + class TestCTCLossAPICase(unittest.TestCase): + def test_functinal_api(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.batch_size = 4 + self.num_classes = CUDA_BLOCK_SIZE + 2 + self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + logits = np.random.uniform( + 0.1, + 1.0, + [max(self.logits_length), self.batch_size, self.num_classes], + ).astype(self.dtype) + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 0, + self.num_classes - 1, + [self.batch_size, max(self.labels_length)], + dtype="int32", + ) + + ctc = CTCForward( + softmax, + self.logits_length, + labels, + self.labels_length, + self.num_classes, + self.batch_size, + self.blank, + self.norm_by_times, + ) + loss_np = ctc.forward() + + paddle.disable_static() + softmax = paddle.to_tensor(logits) + labels = paddle.to_tensor(labels) + logits_length = paddle.to_tensor(self.logits_length) + labels_length = paddle.to_tensor(self.labels_length) + loss_pd_mean = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='mean', + ) + loss_pd_mean = loss_pd_mean.numpy() + + loss_pd_sum = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='sum', + ) + loss_pd_sum = loss_pd_sum.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + loss_np_mean = (loss_np / labels_length.numpy()).mean() + loss_np_sum = loss_np.sum() + + np.testing.assert_allclose( + loss_pd_mean, loss_np_mean, rtol=1e-05, atol=1 + ) + np.testing.assert_allclose( + loss_pd_sum, loss_np_sum, rtol=1e-05, atol=1 + ) + + def test_class_api(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.batch_size = 3 + self.num_classes = 15 + self.logits_length = np.array([3, 3, 3], dtype=np.int64) + self.labels_length = np.array([0, 1, 2], dtype=np.int64) + self.blank = 0 + self.norm_by_times = False + + logits = np.random.uniform( + 0.1, + 1.0, + [max(self.logits_length), self.batch_size, self.num_classes], + ).astype(self.dtype) + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 1, + self.num_classes, + [self.batch_size, max(self.labels_length)], + dtype="int32", + ) + + ctc = CTCForward( + softmax, + self.logits_length, + labels, + self.labels_length, + self.num_classes, + self.batch_size, + self.blank, + self.norm_by_times, + ) + loss_np = ctc.forward() + + 
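+            # Run the class API (paddle.nn.CTCLoss with reduction='none') in dygraph
+            # mode and compare its result against the numpy CTCForward reference loss
+            # computed above.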
paddle.disable_static() + softmax = paddle.to_tensor(logits) + labels = paddle.to_tensor(labels) + logits_length = paddle.to_tensor(self.logits_length) + labels_length = paddle.to_tensor(self.labels_length) + + loss_pd = paddle.nn.CTCLoss(self.blank, 'none')( + softmax, labels, logits_length, labels_length + ) + loss_pd = loss_pd.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + + np.testing.assert_allclose(loss_pd, loss_np, rtol=1e-05, atol=1) + + +support_types = get_xpu_op_support_types('warpctc') +for stype in support_types: + create_test_class(globals(), XPUTestWarpCTCOp, stype) + + +if __name__ == "__main__": + unittest.main() From fd8ec69d2ed3fb122f4acedfedc6854388b9bb20 Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:23:00 +0800 Subject: [PATCH 161/210] remove eye (#48127) --- python/paddle/fluid/layers/tensor.py | 108 ------------------ .../tests/unittests/npu/test_eye_op_npu.py | 33 ------ .../fluid/tests/unittests/test_eye_op.py | 46 -------- .../fluid/tests/unittests/test_layers.py | 64 ----------- 4 files changed, 251 deletions(-) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ee7b764ad7b86..b7e0e60145df3 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -68,7 +68,6 @@ 'zeros_like', 'ones_like', 'diag', - 'eye', 'triu', ] @@ -1787,113 +1786,6 @@ def diag(diagonal): return out -def eye( - num_rows, num_columns=None, batch_shape=None, dtype='float32', name=None -): - """ - This function constructs a or a batch of 2-D tensor with ones on the diagonal and zeros elsewhere. - - Args: - num_rows(int): the number of rows in each batch tensor. - num_columns(int, optional): the number of columns in each batch tensor. - If None, default: num_rows. - batch_shape(list, optional): If provided, the returned tensor will have a leading - batch size of this shape, the data type of ``batch_shape`` is int. Default is None. - dtype(np.dtype|str, optional): The data type of the returned tensor. - It should be int32, int64, float16, float32, float64, default is 'float32'. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. - - Returns: - Tensor: An identity Tensor or LoDTensor of shape batch_shape + [num_rows, num_columns]. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.layers.eye(3, dtype='int32') - # [[1, 0, 0] - # [0, 1, 0] - # [0, 0, 1]] - - data = fluid.layers.eye(2, 3, dtype='int32') - # [[1, 0, 0] - # [0, 1, 0]] - - data = fluid.layers.eye(2, batch_shape=[3]) - # Construct a batch of 3 identity tensors, each 2 x 2. - # data[i, :, :] is a 2 x 2 identity tensor, i = 0, 1, 2. 
- - """ - - def _check_attr(attr, message): - if isinstance(attr, ((Variable, core.VarBase, core.eager.Tensor))): - assert len(attr.shape) == 1 and attr.shape[0] in [1, -1] - elif not isinstance(attr, int) or attr < 0: - raise TypeError("{} should be a non-negative int.".format(message)) - - _check_attr(num_rows, "num_rows") - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - if num_columns is not None: - _check_attr(num_columns, "num_columns") - else: - num_columns = num_rows - - if in_dygraph_mode(): - out = _C_ops.eye( - num_rows, num_columns, dtype, _current_expected_place() - ) - elif _in_legacy_dygraph(): - out = _legacy_C_ops.eye( - 'dtype', dtype, 'num_rows', num_rows, 'num_columns', num_columns - ) - else: - helper = LayerHelper("eye", **locals()) - check_dtype( - dtype, - 'dtype', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'eye', - ) - out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op( - type='eye', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'num_rows': num_rows, - 'num_columns': num_columns, - 'dtype': dtype, - }, - stop_gradient=True, - ) - - if batch_shape is not None: - re_shape = [1] * len(batch_shape) - re_shape = re_shape + [num_rows, num_columns] - expand_times = batch_shape + [1, 1] - if _non_static_mode(): - out, _ = _legacy_C_ops.reshape2(out, None, 'shape', re_shape) - return _legacy_C_ops.expand(out, None, 'expand_times', expand_times) - - if not isinstance(batch_shape, list): - raise TypeError("batch_shape should be a list") - for batch_val in batch_shape: - if batch_val <= 0: - raise TypeError("batch_shape should be a positive int list") - - from .nn import expand - from paddle import reshape - - out = reshape(x=out, shape=re_shape) - out = expand(x=out, expand_times=expand_times) - - out.stop_gradient = True - return out - - def ones_like(x, out=None): """ **ones_like** diff --git a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py index e80be96f2cdcb..dd4ad921172c8 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py @@ -139,39 +139,6 @@ def test_out(self): paddle.enable_static() self.assertEqual((out.numpy() == expected_result).all(), True) - paddle.disable_static(paddle.NPUPlace(0)) - batch_shape = [2] - out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) - result = np.eye(10, dtype="int32") - expected_result = [] - for index in reversed(batch_shape): - tmp_result = [] - for i in range(index): - tmp_result.append(result) - result = tmp_result - expected_result = np.stack(result, axis=0) - paddle.enable_static() - self.assertEqual( - out.numpy().shape == np.array(expected_result).shape, True - ) - self.assertEqual((out.numpy() == expected_result).all(), True) - - paddle.disable_static(paddle.NPUPlace(0)) - batch_shape = [3, 2] - out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) - result = np.eye(10, dtype="int32") - expected_result = [] - for index in reversed(batch_shape): - tmp_result = [] - for i in range(index): - tmp_result.append(result) - result = tmp_result - expected_result = np.stack(result, axis=0) - paddle.enable_static() - self.assertEqual( - out.numpy().shape == np.array(expected_result).shape, True - ) - self.assertEqual((out.numpy() == expected_result).all(), True) def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): diff --git 
a/python/paddle/fluid/tests/unittests/test_eye_op.py b/python/paddle/fluid/tests/unittests/test_eye_op.py index e61037ec1afbd..fb93aee9b30cd 100644 --- a/python/paddle/fluid/tests/unittests/test_eye_op.py +++ b/python/paddle/fluid/tests/unittests/test_eye_op.py @@ -109,40 +109,6 @@ def test_out(self): paddle.enable_static() self.assertEqual((out.numpy() == expected_result).all(), True) - paddle.disable_static() - batch_shape = [2] - out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape) - result = np.eye(10, dtype="int64") - expected_result = [] - for index in reversed(batch_shape): - tmp_result = [] - for i in range(index): - tmp_result.append(result) - result = tmp_result - expected_result = np.stack(result, axis=0) - paddle.enable_static() - self.assertEqual( - out.numpy().shape == np.array(expected_result).shape, True - ) - self.assertEqual((out.numpy() == expected_result).all(), True) - - paddle.disable_static() - batch_shape = [3, 2] - out = fluid.layers.eye(10, 10, dtype="int64", batch_shape=batch_shape) - result = np.eye(10, dtype="int64") - expected_result = [] - for index in reversed(batch_shape): - tmp_result = [] - for i in range(index): - tmp_result.append(result) - result = tmp_result - expected_result = np.stack(result, axis=0) - paddle.enable_static() - self.assertEqual( - out.numpy().shape == np.array(expected_result).shape, True - ) - self.assertEqual((out.numpy() == expected_result).all(), True) - def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): @@ -212,18 +178,6 @@ def test_error(self): paddle.eye(-1) -class TestEyeRowsCol2(TestEyeRowsCol): - def call_func(self, x): - rows = paddle.assign(3) - cols = paddle.assign(10) - out = paddle.fluid.layers.eye(rows, cols) - return out - - def test_error(self): - with self.assertRaises(TypeError): - paddle.fluid.layers.eye(-1) - - if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 38b0d96571b60..1d892e76c92c0 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2407,70 +2407,6 @@ def test_conv3d_transpose(self): conv3d1.bias.numpy(), conv3d2.bias.numpy() ) - def test_eye_op(self): - np_eye = np.eye(3, 2) - array_rlt1 = [np_eye for _ in range(3)] - stack_rlt1 = np.stack(array_rlt1, axis=0) - array_rlt2 = [stack_rlt1 for _ in range(4)] - stack_rlt2 = np.stack(array_rlt2, axis=0) - - with self.dynamic_graph(): - with _test_eager_guard(): - eager_eye_tensor = layers.eye(num_rows=3, num_columns=2) - eager_eye_tensor_rlt1 = layers.eye( - num_rows=3, num_columns=2, batch_shape=[3] - ) - eager_eye_tensor_rlt2 = layers.eye( - num_rows=3, num_columns=2, batch_shape=[4, 3] - ) - eager_diag_tensor = layers.eye(20) - eager_eye_tensor_value = eager_eye_tensor.numpy() - eager_eye_tensor_rlt1_value = eager_eye_tensor_rlt1.numpy() - eager_eye_tensor_rlt2_value = eager_eye_tensor_rlt2.numpy() - eager_diag_tensor_value = eager_diag_tensor.numpy() - - eye_tensor = layers.eye(num_rows=3, num_columns=2) - eye_tensor_rlt1 = layers.eye( - num_rows=3, num_columns=2, batch_shape=[3] - ) - eye_tensor_rlt2 = layers.eye( - num_rows=3, num_columns=2, batch_shape=[4, 3] - ) - diag_tensor = layers.eye(20) - eye_tensor_value = eye_tensor.numpy() - eye_tensor_rlt1_value = eye_tensor_rlt1.numpy() - eye_tensor_rlt2_value = eye_tensor_rlt2.numpy() - diag_tensor_value = diag_tensor.numpy() - - 
np.testing.assert_allclose(eager_eye_tensor_value, np_eye, rtol=1e-05) - np.testing.assert_allclose( - eager_eye_tensor_rlt1_value, stack_rlt1, rtol=1e-05 - ) - np.testing.assert_allclose( - eager_eye_tensor_rlt2_value, stack_rlt2, rtol=1e-05 - ) - np.testing.assert_allclose( - eager_diag_tensor_value, np.eye(20), rtol=1e-05 - ) - - np.testing.assert_allclose(eye_tensor_value, np_eye, rtol=1e-05) - np.testing.assert_allclose( - eye_tensor_rlt1_value, stack_rlt1, rtol=1e-05 - ) - np.testing.assert_allclose( - eye_tensor_rlt2_value, stack_rlt2, rtol=1e-05 - ) - np.testing.assert_allclose(diag_tensor_value, np.eye(20), rtol=1e-05) - - with self.assertRaises(TypeError): - layers.eye(num_rows=3.1) - with self.assertRaises(TypeError): - layers.eye(num_rows=3, num_columns=2.2) - with self.assertRaises(TypeError): - layers.eye(num_rows=3, batch_shape=2) - with self.assertRaises(TypeError): - layers.eye(num_rows=3, batch_shape=[-1]) - def func_while_loop(self): with self.static_graph(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) From a606db6762fc7fde1381d376e1e3e0bfe9672e7a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 23 Nov 2022 10:40:13 +0800 Subject: [PATCH 162/210] fix vector out of range error (#48255) --- paddle/phi/kernels/funcs/broadcast_function.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index a222422c89fdc..325a90e5ed05a 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -878,7 +878,7 @@ void BroadcastKernelForDifferentVecSize( const auto dims_simplifier = BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); if (VLOG_IS_ON(4)) { - for (size_t i = 0; i < dims_simplifier.in_dims.size(); ++i) { + for (size_t i = 0; i < ins.size(); ++i) { VLOG(4) << "input i=" << i << ": origin_dims={" << ins[i]->dims() << "}, simplied_dims={" << phi::make_ddim(dims_simplifier.in_dims[i]) << "}"; From 32462c6489489f2c045a809315052f2992302ed7 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Wed, 23 Nov 2022 10:46:09 +0800 Subject: [PATCH 163/210] opt kernel_factory warning message (#48245) --- paddle/phi/core/kernel_factory.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 0352312edc025..3370e9b805889 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -127,8 +127,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( if (kernel_iter != iter->second.end()) { return {kernel_iter->second, false}; } - LOG(WARNING) << "The cudnn kernel for [" << kernel_name - << "] is not registered."; + VLOG(3) << "The cudnn kernel for [" << kernel_name + << "] is not registered."; } #endif auto kernel_iter = iter->second.find(kernel_key); From 9666979d8db9421fdfe2a97c0ef700de4f5eb642 Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Wed, 23 Nov 2022 11:31:44 +0800 Subject: [PATCH 164/210] move conv2d_transpose and conv3d_transpose (#48198) --- python/paddle/fluid/layers/nn.py | 727 ----------------- .../test_mkldnn_conv_bias_fuse_pass.py | 3 +- .../inference/test_trt_conv3d_transpose_op.py | 5 +- .../ir/inference/test_trt_conv_pass.py | 3 +- .../test_trt_conv_quant_dequant_pass.py | 2 +- .../mlu/test_conv2d_transposed_op_mlu.py | 26 +- .../npu/test_conv2d_transpose_op_npu.py | 14 +- .../unittests/test_conv2d_transpose_layer.py | 3 +- 
.../unittests/test_conv2d_transpose_op.py | 30 +- .../unittests/test_conv3d_transpose_layer.py | 3 +- .../test_conv3d_transpose_part2_op.py | 23 +- .../unittests/test_conv_transpose_nn_grad.py | 10 +- .../test_functional_conv2d_transpose.py | 2 +- .../test_functional_conv3d_transpose.py | 4 +- .../test_imperative_load_static_param.py | 8 +- .../fluid/tests/unittests/test_layers.py | 6 +- python/paddle/static/nn/__init__.py | 4 +- python/paddle/static/nn/common.py | 729 +++++++++++++++++- 18 files changed, 804 insertions(+), 798 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1f74a79a91b7c..4d188228bf7ee 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -77,8 +77,6 @@ 'inplace_abn', 'instance_norm', 'data_norm', - 'conv2d_transpose', - 'conv3d_transpose', 'reduce_sum', 'reduce_mean', 'reduce_max', @@ -3811,731 +3809,6 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): return out -def conv2d_transpose( - input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCHW', -): - r""" - :api_attr: Static Graph - - The convolution2D transpose layer calculates the output based on the input, - filter, and dilations, strides, paddings. Input(Input) and output(Output) - are in NCHW or NHWC format. Where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - Parameters(dilations, strides, paddings) are two elements. These two elements - represent height and width, respectively. The details of convolution transpose - layer, please refer to the following explanation and references - `therein `_. - If bias attribution and activation type are provided, bias is added to - the output of the convolution, and the corresponding activation function - is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - Where: - - * :math:`X`: Input value, a 4-D Tensor with NCHW or NHWC format. - * :math:`W`: Filter value, a 4-D Tensor with MCHW format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, a 4-D Tensor with data format 'NCHW' or 'NHWC', the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ] \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ] - - Note: - The conv2d_transpose can be seen as the backward of the conv2d. For conv2d, - when stride > 1, conv2d maps multiple input shape to the same output shape, - so for conv2d_transpose, when stride > 1, input shape maps multiple output shape. 
- If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; - else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`, - conv2d_transpose can compute the kernel size automatically. - - Args: - input(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format, - its data type is float32 or float64. - num_filters(int): The number of the filter. It is as same as the output - image channel. - output_size(int|tuple, optional): The output image size. If output size is a - tuple, it must contain two integers, (image_height, image_width). None if use - filter_size, padding, and stride to calculate output_size. - If output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. output_size and filter_size - should not be None at the same time. - filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_height, filter_size_width). - Otherwise, filter_size_height = filter_size_width = filter_size. None if - use output size to calculate filter_size. Default: None. filter_size and - output_size should not be None at the same time. - stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. - If stride is a tuple, it must contain two integers, (stride_height, stride_width). - Otherwise, stride_height = stride_width = stride. Default: stride = 1. - padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings - on both sides for each dimension. If `padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If `padding` is a tuple or list, - it could be in three forms: `[pad_height, pad_width]` or - `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `padding` can be in the form - `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `padding` can be in the form - `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Default: padding = 0. - dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). - Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. - filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_height, filter_size_width). - Otherwise, filter_size_height = filter_size_width = filter_size. None if - use output size to calculate filter_size. Default: None. - groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - Default: groups = 1. - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as param_attr. 
If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv2d_transpose. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d_transpose - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True. - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - - Returns: - A Tensor representing the conv2d_transpose, whose - data type is the same with input and shape is (num_batches, channels, out_h, - out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor - storing the transposed convolution result, and if act is not None, the - tensor storing transposed convolution and non-linearity activation - result. - - Raises: - ValueError: If the type of `use_cudnn` is not bool. - ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. - ShapeError: If the input is not 4-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - - Examples: - .. code-block:: python - - import paddle - paddle.enable_static() - - data = paddle.static.data(name='data', shape=[None, 3, 32, 32], dtype='float32') - conv2d_transpose = paddle.static.nn.conv2d_transpose(input=data, num_filters=2, filter_size=3) - print(conv2d_transpose.shape) # [-1, 2, 34, 34] - """ - assert ( - param_attr is not False - ), "param_attr should not be False in conv2d_transpose." - if len(input.shape) != 4: - raise ValueError( - "Input size should be 4, " - "but received {}".format(len(input.shape)) - ) - - if data_format not in ['NCHW', 'NHWC']: - raise ValueError( - "Attr(data_format) of Op(fluid.layers.conv2d_transpose) got wrong value: received " - + data_format - + " but only NCHW or NHWC supported." 
- ) - - input_channel = input.shape[1] if data_format == 'NCHW' else input.shape[-1] - op_type = 'conv2d_transpose' - if ( - input_channel == groups - and num_filters == input_channel - and not use_cudnn - ): - op_type = 'depthwise_conv2d_transpose' - - helper = LayerHelper(op_type, **locals()) - if not isinstance(input, Variable): - raise TypeError("Input of conv2d_transpose must be Variable") - - stride = utils.convert_to_list(stride, 2, 'stride') - dilation = utils.convert_to_list(dilation, 2, 'dilation') - - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - - def _update_padding(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding) and len(padding) == 4: - if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding) - ) - padding = padding[2:4] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): - if not (padding[0] == [0, 0] and padding[3] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding) - ) - padding = padding[1:3] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 4, 'padding') - else: - padding = utils.convert_to_list(padding, 2, 'padding') - padding = [padding[0], padding[0], padding[1], padding[1]] - return padding - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." - % str(padding) - ) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0, 0] - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0, 0] - - padding = _update_padding(padding, data_format) - - if output_size is None: - output_size = [] - elif isinstance(output_size, (list, tuple)): - if utils._contain_var(output_size): - output_size = utils._convert_to_tensor_list(output_size) - else: - output_size = utils.convert_to_list(output_size, 2, 'output_size') - elif isinstance(output_size, int): - output_size = utils.convert_to_list(output_size, 2, 'output_size') - elif isinstance(output_size, Variable): - check_dtype( - output_size.dtype, - 'output_size', - ['int32', 'int64'], - 'conv2d_transpose', - ) - if len(output_size.shape) == 1 and ( - output_size.shape[0] == 1 or output_size.shape[0] == 2 - ): - if output_size.shape[0] == 1: - output_size = [output_size, output_size] - else: - raise ValueError("output_size must contain one or two integers.") - else: - raise ValueError( - "output_size should be int, list[int] or tuple[int] or Tensor" - ) - - if filter_size is None: - if output_size is []: - raise ValueError("output_size must be set when filter_size is None") - if not _non_static_mode(): - if isinstance(output_size, Variable) or utils._contain_var( - output_size - ): - raise ValueError( - "filter_size should not be None when output_size is Variable or contain Variable in static mode." 
- ) - else: - output_size = utils.convert_shape_to_list(output_size) - if len(output_size) == 1: - output_size = utils.convert_to_list( - output_size[0], 2, 'output_size' - ) - - h_in = input.shape[2] if data_format == 'NCHW' else input.shape[1] - w_in = input.shape[3] if data_format == 'NCHW' else input.shape[2] - - filter_size_h = ( - output_size[0] - - (h_in - 1) * stride[0] - + padding[0] - + padding[1] - - 1 - ) // dilation[0] + 1 - filter_size_w = ( - output_size[1] - - (w_in - 1) * stride[1] - + padding[2] - + padding[3] - - 1 - ) // dilation[1] + 1 - filter_size = [filter_size_h, filter_size_w] - else: - filter_size = utils.convert_to_list( - filter_size, 2, 'conv2d_transpose.filter_size' - ) - - if len(padding) == 4 and utils._is_symmetric_padding(padding, 2): - padding = [padding[0], padding[2]] - - if groups is None: - groups = 1 - elif groups <= 0: - raise ValueError( - "the groups of input must be greater than 0, " - "but received the groups of input is {}".format(groups) - ) - - filter_shape = [input_channel, num_filters // groups] + filter_size - - img_filter = helper.create_parameter( - dtype=input.dtype, shape=filter_shape, attr=helper.param_attr - ) - - pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type=op_type, - inputs={'Input': [input], 'Filter': [img_filter]}, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format, - }, - ) - - if data_format == 'NCHW': - pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) - else: - pre_act = helper.append_bias_op(pre_bias, dim_start=3, dim_end=4) - out = helper.append_activation(pre_act) - return out - - -def conv3d_transpose( - input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCDHW', -): - r""" - :api_attr: Static Graph - - The convolution3D transpose layer calculates the output based on the input, - filter, and dilations, strides, paddings. Input(Input) and output(Output) - are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels, - D is the depth of the feature, H is the height of the feature, and W - is the width of the feature. Parameters(dilations, strides, paddings) are - two elements. These two elements represent height and width, respectively. - The details of convolution transpose layer, please refer to the following - explanation and references `therein `_. - If bias attribution and activation type are provided, bias is added to - the output of the convolution, and the corresponding activation function - is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \ast X + b) - - In the above equation: - - * :math:`X`: Input value, a Tensor with NCDHW or NDHWC format. - * :math:`W`: Filter value, a Tensor with MCDHW format. - * :math:`\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. - * :math:`\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. 
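The conv2d_transpose body quoted above infers the kernel size from output_size when filter_size is None. A small numeric check of that inference, with illustrative values (h_in=8, stride=2, no padding, dilation=1, target output 17):

    # same expression as filter_size_h in the removed code above
    h_in, h_out, stride, pad_top, pad_bottom, dilation = 8, 17, 2, 0, 0, 1
    k = (h_out - (h_in - 1) * stride + pad_top + pad_bottom - 1) // dilation + 1
    print(k)  # 3
    # forward check with the transposed-convolution output formula
    print((h_in - 1) * stride - pad_top - pad_bottom + dilation * (k - 1) + 1)  # 17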
- - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` - - - Output: - - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ - H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ - W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\ - D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\ - H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\ - W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ] - - Note: - The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, - when stride > 1, conv3d maps multiple input shape to the same output shape, - so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. - If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ - H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output - size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, - the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` - and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, - conv3d_transpose can compute the kernel size automatically. - - Args: - input(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type - of input is float32 or float64. - num_filters(int): The number of the filter. It is as same as the output - image channel. - output_size(int|tuple, optional): The output image size. If output size is a - tuple, it must contain three integers, (image_depth, image_height, image_width). This - parameter only works when filter_size is None. If output_size and filter_size are - specified at the same time, They should follow the formula above. Default: None. - Output_size and filter_size should not be None at the same time. - filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_depth, filter_size_height, - filter_size_width). Otherwise, filter_size_depth = filter_size_height = \ - filter_size_width = filter_size. None if use output size to - calculate filter_size. Default: None. filter_size and output_size should not be - None at the same time. - padding(int|list|str|tuple, optional): The padding size. The padding argument effectively - adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, - either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding` - is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `'NCDHW'`, `padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `'NDHWC'`, `padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Default: padding = 0. - stride(int|tuple, optional): The stride size. 
It means the stride in transposed convolution. - If stride is a tuple, it must contain three integers, (stride_depth, stride_height, - stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. - Default: stride = 1. - dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. - If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. - Default: dilation = 1. - groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by - grouped convolution in Alex Krizhevsky's Deep CNN paper, in which - when group=2, the first half of the filters is only connected to the - first half of the input channels, while the second half of the - filters is only connected to the second half of the input channels. - Default: groups=1 - param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose - will create ParamAttr as param_attr. If the Initializer of the param_attr - is not set, the parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv3d_transpose - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str, optional): Activation type, if it is set to None, activation is not appended. - Default: None. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - - Returns: - A Variable holding Tensor representing the conv3d_transpose, whose data - type is the same with input and shape is (num_batches, channels, out_d, out_h, - out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor - variable storing the transposed convolution result, and if act is not None, the tensor - variable storing transposed convolution and non-linearity activation result. - - Raises: - ValueError: If the type of `use_cudnn` is not bool. - ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. - ShapeError: If the input is not 5-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. 
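Applying the D/H/W output formulas above to the configuration used in the Examples block that follows (input [1, 3, 12, 32, 32], filter_size=3, default stride, padding and dilation), a short sketch predicting the resulting shape:

    def out_dim(in_dim, stride=1, pad=0, dilation=1, k=3):
        # (in - 1)*stride - 2*pad + dilation*(k - 1) + 1
        return (in_dim - 1) * stride - 2 * pad + dilation * (k - 1) + 1

    print([out_dim(d) for d in (12, 32, 32)])  # [14, 34, 34] -> shape [1, 2, 14, 34, 34]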
- ShapeError: If the size of `output_size` is not equal to that of `stride`. - - Examples: - .. code-block:: python - - import paddle - import numpy as np - - paddle.enable_static() - data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') - param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) - res = paddle.static.nn.conv3d_transpose(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - x = np.random.rand(1, 3, 12, 32, 32).astype("float32") - output = exe.run(feed={"data": x}, fetch_list=[res]) - print(output) - """ - assert ( - param_attr is not False - ), "param_attr should not be False in conv3d_transpose." - if data_format not in ['NCDHW', 'NDHWC']: - raise ValueError( - "Param(data_format) of Op(fluid.layers.conv3d_transpose) got wrong value: received " - + data_format - + " but only NCDHW or NDHWC supported." - ) - - l_type = "conv3d_transpose" - helper = LayerHelper(l_type, **locals()) - if not isinstance(input, Variable): - raise TypeError("Input of conv3d_transpose must be Variable") - if len(input.shape) != 5: - raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}".format( - input.shape - ) - ) - input_channel = ( - input.shape[1] if data_format == 'NCDHW' else input.shape[-1] - ) - - stride = utils.convert_to_list(stride, 3, 'stride') - dilation = utils.convert_to_list(dilation, 3, 'dilation') - - if not isinstance(use_cudnn, bool): - raise ValueError("use_cudnn should be True or False") - - def _update_padding(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding) and len(padding) == 5: - if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding) - ) - padding = padding[2:5] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): - if not (padding[0] == [0, 0] and padding[4] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding) - ) - padding = padding[1:4] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 6, 'padding') - - elif is_list_or_tuple(padding) and len(padding) == 6: - padding = utils.convert_to_list(padding, 6, 'padding') - - else: - padding = utils.convert_to_list(padding, 3, 'padding') - padding = [ - padding[0], - padding[0], - padding[1], - padding[1], - padding[2], - padding[2], - ] - return padding - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
- % str(padding) - ) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0, 0, 0, 0] - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0, 0, 0, 0] - - padding = _update_padding(padding, data_format) - - if filter_size is None: - if output_size is None: - raise ValueError("output_size must be set when filter_size is None") - if isinstance(output_size, int): - output_size = [output_size, output_size, output_size] - - d_in = input.shape[2] if data_format == 'NCDHW' else input.shape[1] - h_in = input.shape[3] if data_format == 'NCDHW' else input.shape[2] - w_in = input.shape[4] if data_format == 'NCDHW' else input.shape[3] - - filter_size_d = ( - output_size[0] - - (d_in - 1) * stride[0] - + padding[0] - + padding[1] - - 1 - ) // dilation[0] + 1 - filter_size_h = ( - output_size[1] - - (h_in - 1) * stride[1] - + padding[2] - + padding[3] - - 1 - ) // dilation[1] + 1 - filter_size_w = ( - output_size[2] - - (w_in - 1) * stride[2] - + padding[4] - + padding[5] - - 1 - ) // dilation[2] + 1 - filter_size = [filter_size_d, filter_size_h, filter_size_w] - else: - filter_size = utils.convert_to_list( - filter_size, 3, 'conv3d_transpose.filter_size' - ) - - if len(padding) == 6 and utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] - - if output_size is None: - output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 3, 'output_size') - else: - raise ValueError("output_size should be int, list[int] or tuple[int]") - - groups = 1 if groups is None else groups - if groups <= 0: - raise ValueError( - "the groups of conv3d_transpose should be greater than 0. Received groups: {}".format( - groups - ) - ) - if num_filters % groups != 0: - raise ValueError( - "Attr(num_filters) must be divisible by groups," - "Received: Attr(num_filters) is {}, the groups is {}".format( - num_filters, groups - ) - ) - - filter_shape = [input_channel, num_filters // groups] + filter_size - img_filter = helper.create_parameter( - dtype=input.dtype, shape=filter_shape, attr=helper.param_attr - ) - - if data_format == 'NCDHW': - data_format = 'NCHW' - if data_format == 'NDHWC': - data_format = 'NHWC' - - pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type=l_type, - inputs={'Input': [input], 'Filter': [img_filter]}, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format, - }, - ) - - if data_format == 'NCHW': - pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) - else: - pre_act = helper.append_bias_op(pre_bias, dim_start=4, dim_end=5) - out = helper.append_activation(pre_act) - return out - - def reduce_sum(input, dim=None, keep_dim=False, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py index 6c667ac08db1c..517d826a7cd42 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py @@ -15,6 +15,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid from paddle.fluid.core 
import PassVersionChecker @@ -173,7 +174,7 @@ def setUp(self): initializer=fluid.initializer.Xavier(uniform=False), learning_rate=0.001, ) - conv_out = fluid.layers.conv2d_transpose( + conv_out = paddle.static.nn.conv2d_transpose( input=data, num_filters=3, filter_size=3, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py index 9beabe5505230..491cbe80b3879 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_transpose_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -28,7 +29,7 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 4, 4, 32, 32], dtype="float32" ) - conv_out = fluid.layers.conv3d_transpose( + conv_out = paddle.static.nn.conv3d_transpose( input=data, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, @@ -95,7 +96,7 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 6, -1, -1, -1], dtype="float32" ) - conv_out = fluid.layers.conv3d_transpose( + conv_out = paddle.static.nn.conv3d_transpose( input=data, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py index 90dbed96f9066..0d2b314db4e2f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py @@ -16,6 +16,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -109,7 +110,7 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 6, 64, 64], dtype="float32" ) - conv_out = fluid.layers.conv2d_transpose( + conv_out = paddle.static.nn.conv2d_transpose( input=data, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py index 8ca6bbad042dd..7ef60536ca2e3 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py @@ -237,7 +237,7 @@ def network(): data_reshape = paddle.reshape(self.data, shape=[1, 4, 14, 14]) self.label = fluid.data(name='label', shape=[1, 1], dtype='int64') label_shape = paddle.reshape(self.label, shape=[1, 1, 1]) - conv_out = fluid.layers.conv2d_transpose( + conv_out = paddle.static.nn.conv2d_transpose( input=data_reshape, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py index 076c6e2ca3fcd..a24206e0900b3 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py @@ -499,21 +499,21 @@ def test_case1(self): 
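The hunks above and below all apply the same mechanical substitution at each call site. A minimal static-graph sketch of the change, with a made-up tensor name and shape purely for illustration:

    import paddle

    paddle.enable_static()
    data = paddle.static.data(name='data', shape=[None, 3, 8, 8], dtype='float32')

    # Before this patch the layer was reached through the fluid namespace:
    #   out = paddle.fluid.layers.conv2d_transpose(input=data, num_filters=6, filter_size=3)
    # After this patch the same layer is reached through paddle.static.nn:
    out = paddle.static.nn.conv2d_transpose(input=data, num_filters=6, filter_size=3)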
data2 = fluid.layers.data( name='data2', shape=[5, 5, 3], dtype='float32' ) - out1 = fluid.layers.conv2d_transpose( + out1 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, filter_size=3, data_format='NCHW', ) - out2 = fluid.layers.conv2d_transpose( + out2 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, filter_size=3, data_format='NHWC', ) - out3 = fluid.layers.conv2d_transpose( + out3 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -521,7 +521,7 @@ def test_case1(self): padding=[[0, 0], [1, 1], [1, 1], [0, 0]], data_format='NHWC', ) - out4 = fluid.layers.conv2d_transpose( + out4 = paddle.static.nn.conv2d_transpose( input=data1, groups=3, num_filters=6, @@ -529,7 +529,7 @@ def test_case1(self): padding=[[0, 0], [0, 0], [2, 1], [0, 0]], data_format='NCHW', ) - out5 = fluid.layers.conv2d_transpose( + out5 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, @@ -537,7 +537,7 @@ def test_case1(self): padding='SAME', data_format='NCHW', ) - out6 = fluid.layers.conv2d_transpose( + out6 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -545,7 +545,7 @@ def test_case1(self): padding='VALID', data_format='NHWC', ) - out7 = fluid.layers.conv2d_transpose( + out7 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -586,7 +586,7 @@ def test_exception(self): data = fluid.layers.data(name='data', shape=[3, 5, 5], dtype="float32") def attr_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -597,7 +597,7 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) def attr_padding_str(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -608,7 +608,7 @@ def attr_padding_str(): self.assertRaises(ValueError, attr_padding_str) def attr_padding_list(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -619,7 +619,7 @@ def attr_padding_list(): self.assertRaises(ValueError, attr_padding_list) def attr_padding_with_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -635,14 +635,14 @@ def attr_padding_with_data_format(): ) def error_input_size(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=error_input, groups=1, num_filters=6, filter_size=3 ) self.assertRaises(ValueError, error_input_size) def error_groups(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=0, num_filters=6, diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py index d27c98b270bfb..aae34ebfb5be2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_transpose_op_npu.py @@ -435,21 +435,21 @@ def test_case1(self): data2 = fluid.layers.data( name='data2', shape=[5, 5, 3], dtype='float32' ) - out1 = fluid.layers.conv2d_transpose( + out1 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, filter_size=3, data_format='NCHW', ) - out2 = fluid.layers.conv2d_transpose( + out2 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, filter_size=3, 
data_format='NHWC', ) - out3 = fluid.layers.conv2d_transpose( + out3 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -457,7 +457,7 @@ def test_case1(self): padding=[[0, 0], [1, 1], [1, 1], [0, 0]], data_format='NHWC', ) - out4 = fluid.layers.conv2d_transpose( + out4 = paddle.static.nn.conv2d_transpose( input=data1, groups=3, num_filters=6, @@ -465,7 +465,7 @@ def test_case1(self): padding=[[0, 0], [0, 0], [2, 1], [0, 0]], data_format='NCHW', ) - out5 = fluid.layers.conv2d_transpose( + out5 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, @@ -473,7 +473,7 @@ def test_case1(self): padding='SAME', data_format='NCHW', ) - out6 = fluid.layers.conv2d_transpose( + out6 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -481,7 +481,7 @@ def test_case1(self): padding='VALID', data_format='NHWC', ) - out7 = fluid.layers.conv2d_transpose( + out7 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py index 79a90046bfd78..266543bf16af3 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py @@ -13,6 +13,7 @@ # limitations under the License. import numpy as np +import paddle from paddle import fluid, nn import paddle.fluid.dygraph as dg import paddle.nn.functional as F @@ -104,7 +105,7 @@ def fluid_layer(self, place): else: bias_attr = I.NumpyArrayInitializer(self.bias) - y_var = fluid.layers.conv2d_transpose( + y_var = paddle.static.nn.conv2d_transpose( x_var, self.num_filters, filter_size=self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 482da8164b245..f22a536cda888 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -835,21 +835,21 @@ def test_case1(self): data2 = fluid.layers.data( name='data2', shape=[5, 5, 3], dtype='float32' ) - out1 = fluid.layers.conv2d_transpose( + out1 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, filter_size=3, data_format='NCHW', ) - out2 = fluid.layers.conv2d_transpose( + out2 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, filter_size=3, data_format='NHWC', ) - out3 = fluid.layers.conv2d_transpose( + out3 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -857,7 +857,7 @@ def test_case1(self): padding=[[0, 0], [1, 1], [1, 1], [0, 0]], data_format='NHWC', ) - out4 = fluid.layers.conv2d_transpose( + out4 = paddle.static.nn.conv2d_transpose( input=data1, groups=3, num_filters=6, @@ -865,7 +865,7 @@ def test_case1(self): padding=[[0, 0], [0, 0], [2, 1], [0, 0]], data_format='NCHW', ) - out5 = fluid.layers.conv2d_transpose( + out5 = paddle.static.nn.conv2d_transpose( input=data2, groups=1, num_filters=6, @@ -873,7 +873,7 @@ def test_case1(self): padding='SAME', data_format='NCHW', ) - out6 = fluid.layers.conv2d_transpose( + out6 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -881,7 +881,7 @@ def test_case1(self): padding='VALID', data_format='NHWC', ) - out7 = fluid.layers.conv2d_transpose( + out7 = paddle.static.nn.conv2d_transpose( input=data1, groups=1, num_filters=6, @@ -919,7 +919,7 @@ def 
test_exception(self): data = fluid.layers.data(name='data', shape=[3, 5, 5], dtype="float32") def attr_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -930,7 +930,7 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) def attr_padding_str(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -941,7 +941,7 @@ def attr_padding_str(): self.assertRaises(ValueError, attr_padding_str) def attr_padding_list(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -952,7 +952,7 @@ def attr_padding_list(): self.assertRaises(ValueError, attr_padding_list) def attr_padding_with_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -968,14 +968,14 @@ def attr_padding_with_data_format(): ) def error_input_size(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=error_input, groups=1, num_filters=6, filter_size=3 ) self.assertRaises(ValueError, error_input_size) def error_groups(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=0, num_filters=6, @@ -1064,7 +1064,7 @@ def path_prefix(self): def call_func(self, x): w_var = paddle.randn((3, 6, 3, 3), dtype='float32') output_size = paddle.assign([17]) - out = paddle.fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( x, num_filters=6, output_size=output_size, filter_size=3, stride=2 ) return out @@ -1076,7 +1076,7 @@ def path_prefix(self): def call_func(self, x): output_size = [17, paddle.assign([17])] - out = paddle.fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( x, num_filters=6, output_size=output_size, filter_size=3, stride=2 ) return out diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index 87511cb0fafb8..6c690595a0072 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -13,6 +13,7 @@ # limitations under the License. 
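The call_func hunks above pass output_size as a Tensor (paddle.assign([17])) or as a list mixing an int with a Tensor. A sketch of the accepted spellings, assuming an 8x8 input and stride=2 so that 17 is a legal output size; filter_size must be given whenever output_size involves a Tensor in static mode:

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name='x', shape=[None, 3, 8, 8], dtype='float32')

    # plain int, applied to both spatial dimensions
    y1 = paddle.static.nn.conv2d_transpose(
        x, num_filters=6, output_size=17, filter_size=3, stride=2)
    # list mixing an int with a Tensor, as in the second call_func hunk above
    y2 = paddle.static.nn.conv2d_transpose(
        x, num_filters=6, output_size=[17, paddle.assign([17])], filter_size=3, stride=2)
    # 1-D int Tensor, as in the first call_func hunk above
    y3 = paddle.static.nn.conv2d_transpose(
        x, num_filters=6, output_size=paddle.assign([17, 17]), filter_size=3, stride=2)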
import numpy as np +import paddle from paddle import fluid, nn import paddle.fluid.dygraph as dg import paddle.nn.functional as F @@ -101,7 +102,7 @@ def fluid_layer(self, place): bias_attr = False else: bias_attr = I.NumpyArrayInitializer(self.bias) - y_var = fluid.layers.conv3d_transpose( + y_var = paddle.static.nn.conv3d_transpose( x_var, self.num_filters, filter_size=self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py index 0aed6c1c515be..80fb35f902a8f 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from test_conv3d_transpose_op import TestConv3DTransposeOp @@ -91,21 +92,21 @@ def test_case1(self): name='data2', shape=[5, 5, 5, 3], dtype='float32' ) - out1 = fluid.layers.conv3d_transpose( + out1 = paddle.static.nn.conv3d_transpose( input=data1, groups=1, num_filters=6, filter_size=3, data_format='NCDHW', ) - out2 = fluid.layers.conv3d_transpose( + out2 = paddle.static.nn.conv3d_transpose( input=data2, groups=1, num_filters=6, filter_size=3, data_format='NDHWC', ) - out3 = fluid.layers.conv3d_transpose( + out3 = paddle.static.nn.conv3d_transpose( input=data1, groups=1, num_filters=6, @@ -113,7 +114,7 @@ def test_case1(self): padding=[[0, 0], [0, 0], [1, 1], [0, 0], [1, 1]], data_format='NCDHW', ) - out4 = fluid.layers.conv3d_transpose( + out4 = paddle.static.nn.conv3d_transpose( input=data2, groups=3, num_filters=6, @@ -121,7 +122,7 @@ def test_case1(self): padding=[[0, 0], [0, 0], [1, 1], [1, 2], [0, 0]], data_format='NDHWC', ) - out5 = fluid.layers.conv3d_transpose( + out5 = paddle.static.nn.conv3d_transpose( input=data2, groups=1, num_filters=6, @@ -129,7 +130,7 @@ def test_case1(self): padding='SAME', data_format='NCDHW', ) - out6 = fluid.layers.conv3d_transpose( + out6 = paddle.static.nn.conv3d_transpose( input=data2, groups=1, num_filters=6, @@ -137,7 +138,7 @@ def test_case1(self): padding='VALID', data_format='NDHWC', ) - out7 = fluid.layers.conv3d_transpose( + out7 = paddle.static.nn.conv3d_transpose( input=data2, groups=1, num_filters=6, @@ -177,7 +178,7 @@ def test_exception(self): ) def attr_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -188,7 +189,7 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) def attr_padding_str(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -199,7 +200,7 @@ def attr_padding_str(): self.assertRaises(ValueError, attr_padding_str) def attr_padding_list(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, @@ -210,7 +211,7 @@ def attr_padding_list(): self.assertRaises(ValueError, attr_padding_list) def attr_padding_with_data_format(): - out = fluid.layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=data, groups=1, num_filters=6, diff --git a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py index f4c139a5463ee..c37394ed83427 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py +++ 
b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py @@ -36,7 +36,7 @@ def func(self, place): if core.is_compiled_with_rocm(): dtype = np.float32 x = layers.data('x', shape, False, dtype) - y = layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( x, 2, filter_size=1, groups=1, bias_attr=False ) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -92,7 +92,7 @@ def func(self, place): if core.is_compiled_with_rocm(): dtype = np.float32 x = layers.data('x', shape, False, dtype) - y = layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( input=x, num_filters=2, filter_size=1, @@ -145,7 +145,7 @@ def func(self, place): if core.is_compiled_with_rocm(): dtype = np.float32 x = layers.data('x', shape, False, dtype) - y = layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( input=x, num_filters=2, filter_size=1, @@ -198,7 +198,7 @@ def func(self, place): if core.is_compiled_with_rocm(): dtype = np.float32 x = layers.data('x', shape, False, dtype) - y = layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( input=x, num_filters=2, filter_size=1, @@ -251,7 +251,7 @@ def func(self, place): if core.is_compiled_with_rocm(): dtype = np.float32 x = layers.data('x', shape, False, dtype) - y = layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( input=x, num_filters=2, filter_size=1, diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index d377be24ddf87..d45f13e4c9576 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -89,7 +89,7 @@ def static_graph_case_1(self): (-1, self.in_channels, -1, -1), dtype=self.dtype, ) - y = fluid.layers.conv2d_transpose( + y = paddle.static.nn.conv2d_transpose( x, self.out_channels, output_size=self.output_size, diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index d9ee04953592b..3c013fe873396 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -89,7 +89,7 @@ def static_graph_case_1(self): (-1, self.in_channels, -1, -1, -1), dtype=self.dtype, ) - y = fluid.layers.conv3d_transpose( + y = paddle.static.nn.conv3d_transpose( x, self.out_channels, output_size=self.output_size, @@ -550,7 +550,7 @@ def static_graph_case(self): with fluid.unique_name.guard(): with fluid.program_guard(main, start): x = fluid.data("input", self.input.shape, dtype=paddle.float32) - y = fluid.layers.conv3d_transpose( + y = paddle.static.nn.conv3d_transpose( x, self.num_filters, self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index c0f6badfe2c97..c96fe97fac87f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -103,20 +103,20 @@ def testLoadStaticModel(self): name="conv2d_trans_in", shape=[None, 10, 10, 10] ) - conv2d_trans_out_1 = fluid.layers.conv2d_transpose( + conv2d_trans_out_1 = paddle.static.nn.conv2d_transpose( conv2d_trans_in, num_filters=10, filter_size=5, act="relu" ) - conv2d_trans_out_2 = fluid.layers.conv2d_transpose( + 
conv2d_trans_out_2 = paddle.static.nn.conv2d_transpose( conv2d_trans_in, num_filters=10, filter_size=5, act="relu" ) conv3d_trans_in = fluid.data( name='conv3d_trans_in', shape=[None, 3, 12, 32, 32], dtype='float32' ) - conv3d_trans_out_1 = fluid.layers.conv3d_transpose( + conv3d_trans_out_1 = paddle.static.nn.conv3d_transpose( input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu" ) - conv3d_trans_out_2 = fluid.layers.conv3d_transpose( + conv3d_trans_out_2 = paddle.static.nn.conv3d_transpose( input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu" ) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1d892e76c92c0..8b6c2ada2d2d2 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -716,7 +716,7 @@ def test_conv2d_transpose(self): inp_np = np.arange(0, 24).reshape([2, 3, 2, 2]).astype('float32') with self.static_graph(): img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') - out = layers.conv2d_transpose( + out = paddle.static.nn.conv2d_transpose( input=img, num_filters=10, filter_size=27, @@ -2270,7 +2270,7 @@ def test_conv3d_transpose(self): with self.static_graph(): img = layers.data(name='pixel', shape=[3, 2, 2, 2], dtype='float32') - out = layers.conv3d_transpose( + out = paddle.static.nn.conv3d_transpose( input=img, num_filters=12, filter_size=12, use_cudnn=False ) static_rlt = self.get_static_graph_result( @@ -3062,7 +3062,7 @@ def make_conv2d_transpose(self): fluid.default_main_program(), fluid.default_startup_program() ): img = self._get_data(name='pixel', shape=[3, 2, 2], dtype='float32') - return layers.conv2d_transpose( + return paddle.static.nn.conv2d_transpose( input=img, num_filters=10, output_size=28 ) diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 64e079ae375e3..7f695a48219d4 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -14,15 +14,15 @@ from .common import fc # noqa: F401 from .common import deform_conv2d # noqa: F401 +from .common import conv2d_transpose # noqa: F401 +from .common import conv3d_transpose # noqa: F401 from ...fluid.layers import batch_norm # noqa: F401 from ...fluid.layers import bilinear_tensor_product # noqa: F401 from ...fluid.layers import case # noqa: F401 from ...fluid.layers import cond # noqa: F401 from ...fluid.layers import conv2d # noqa: F401 -from ...fluid.layers import conv2d_transpose # noqa: F401 from ...fluid.layers import conv3d # noqa: F401 -from ...fluid.layers import conv3d_transpose # noqa: F401 from ...fluid.layers import create_parameter # noqa: F401 from ...fluid.layers import crf_decoding # noqa: F401 from ...fluid.layers import data_norm # noqa: F401 diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 9c0e1114f7e2f..aee2009edd28f 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -13,7 +13,9 @@ # limitations under the License. 
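With the __init__.py change above, conv2d_transpose and conv3d_transpose now resolve to the new implementations in paddle.static.nn.common while the public import path stays the same. A minimal usage sketch (shapes are illustrative):

    import paddle
    from paddle.static.nn import conv3d_transpose

    paddle.enable_static()
    vol = paddle.static.data(name='vol', shape=[None, 3, 8, 16, 16], dtype='float32')
    out = conv3d_transpose(input=vol, num_filters=4, filter_size=3)
    # expected [-1, 4, 10, 18, 18] with the default stride=1, padding=0, dilation=1
    print(out.shape)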
import paddle -from paddle.fluid.framework import static_only +from paddle.fluid.framework import static_only, Variable, _non_static_mode + +from paddle.fluid.data_feeder import check_dtype from paddle.common_ops_import import ( check_type, @@ -174,6 +176,731 @@ def fc( ) +def conv2d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCHW', +): + r""" + :api_attr: Static Graph + + The convolution2D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCHW or NHWC format. Where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. + Parameters(dilations, strides, paddings) are two elements. These two elements + represent height and width, respectively. The details of convolution transpose + layer, please refer to the following explanation and references + `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a 4-D Tensor with NCHW or NHWC format. + * :math:`W`: Filter value, a 4-D Tensor with MCHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, a 4-D Tensor with data format 'NCHW' or 'NHWC', the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H^\prime_{out} &= (H_{in} - 1) * strides[0] - pad_height_top - pad_height_bottom + dilations[0] * (H_f - 1) + 1 \\\\ + W^\prime_{out} &= (W_{in} - 1) * strides[1] - pad_width_left - pad_width_right + dilations[1] * (W_f - 1) + 1 \\\\ + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ] \\\\ + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ] + + Note: + The conv2d_transpose can be seen as the backward of the conv2d. For conv2d, + when stride > 1, conv2d maps multiple input shape to the same output shape, + so for conv2d_transpose, when stride > 1, input shape maps multiple output shape. + If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; + else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`, + conv2d_transpose can compute the kernel size automatically. + + Args: + input(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format, + its data type is float32 or float64. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple, optional): The output image size. If output size is a + tuple, it must contain two integers, (image_height, image_width). None if use + filter_size, padding, and stride to calculate output_size. 
+ If output_size and filter_size are specified at the same time, They + should follow the formula above. Default: None. output_size and filter_size + should not be None at the same time. + filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_height, filter_size_width). + Otherwise, filter_size_height = filter_size_width = filter_size. None if + use output size to calculate filter_size. Default: None. filter_size and + output_size should not be None at the same time. + stride(int|tuple, optional): The stride size. It means the stride in transposed convolution. + If stride is a tuple, it must contain two integers, (stride_height, stride_width). + Otherwise, stride_height = stride_width = stride. Default: stride = 1. + padding(str|int|list|tuple, optional): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If `padding` is a tuple or list, + it could be in three forms: `[pad_height, pad_width]` or + `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCHW"`, `padding` can be in the form + `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `"NHWC"`, `padding` can be in the form + `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Default: padding = 0. + dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). + Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. + filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_height, filter_size_width). + Otherwise, filter_size_height = filter_size_width = filter_size. None if + use output size to calculate filter_size. Default: None. + groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups = 1. + param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv2d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn(bool, optional): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True. + act (str, optional): Activation type, if it is set to None, activation is not appended. + Default: None. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. 
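A sketch of the padding spellings described above, using an illustrative 8x8 input; in the nested form the batch and channel entries must stay [0, 0]:

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name='x', shape=[None, 3, 8, 8], dtype='float32')

    # [pad_height, pad_width]
    a = paddle.static.nn.conv2d_transpose(x, num_filters=6, filter_size=3, padding=[1, 2])
    # padding algorithm given as a string
    b = paddle.static.nn.conv2d_transpose(x, num_filters=6, filter_size=3, padding='SAME')
    # nested NCHW form; the leading [0, 0] entries cover batch and channel
    c = paddle.static.nn.conv2d_transpose(
        x, num_filters=6, filter_size=3,
        padding=[[0, 0], [0, 0], [1, 1], [2, 2]], data_format='NCHW')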
+ data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + + Returns: + A Tensor representing the conv2d_transpose, whose + data type is the same with input and shape is (num_batches, channels, out_h, + out_w) or (num_batches, out_h, out_w, channels). If act is None, the tensor + storing the transposed convolution result, and if act is not None, the + tensor storing transposed convolution and non-linearity activation + result. + + Raises: + ValueError: If the type of `use_cudnn` is not bool. + ValueError: If `data_format` is not "NCHW" or "NHWC". + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 + or the element corresponding to the input's channel is not 0. + ValueError: If `output_size` and filter_size are None at the same time. + ShapeError: If the input is not 4-D Tensor. + ShapeError: If the input's dimension size and filter's dimension size not equal. + ShapeError: If the dimension size of input minus the size of `stride` is not 2. + ShapeError: If the number of input channels is not equal to filter's channels. + ShapeError: If the size of `output_size` is not equal to that of `stride`. + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + + data = paddle.static.data(name='data', shape=[None, 3, 32, 32], dtype='float32') + conv2d_transpose = paddle.static.nn.conv2d_transpose(input=data, num_filters=2, filter_size=3) + print(conv2d_transpose.shape) # [-1, 2, 34, 34] + """ + assert ( + param_attr is not False + ), "param_attr should not be False in conv2d_transpose." + if len(input.shape) != 4: + raise ValueError( + "Input size should be 4, " + "but received {}".format(len(input.shape)) + ) + + if data_format not in ['NCHW', 'NHWC']: + raise ValueError( + "Attr(data_format) of Op(paddle.static.nn.layers.conv2d_transpose) got wrong value: received " + + data_format + + " but only NCHW or NHWC supported." + ) + + input_channel = input.shape[1] if data_format == 'NCHW' else input.shape[-1] + op_type = 'conv2d_transpose' + if ( + input_channel == groups + and num_filters == input_channel + and not use_cudnn + ): + op_type = 'depthwise_conv2d_transpose' + + helper = LayerHelper(op_type, **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv2d_transpose must be Variable") + + stride = utils.convert_to_list(stride, 2, 'stride') + dilation = utils.convert_to_list(dilation, 2, 'dilation') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + def _update_padding(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 4: + if is_list_or_tuple(padding[0]) and (data_format == "NCHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." 
% str(padding) + ) + padding = padding[2:4] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): + if not (padding[0] == [0, 0] and padding[3] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding) + ) + padding = padding[1:3] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 4, 'padding') + else: + padding = utils.convert_to_list(padding, 2, 'padding') + padding = [padding[0], padding[0], padding[1], padding[1]] + return padding + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0, 0, 0] + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0, 0, 0] + + padding = _update_padding(padding, data_format) + + if output_size is None: + output_size = [] + elif isinstance(output_size, (list, tuple)): + if utils._contain_var(output_size): + output_size = utils._convert_to_tensor_list(output_size) + else: + output_size = utils.convert_to_list(output_size, 2, 'output_size') + elif isinstance(output_size, int): + output_size = utils.convert_to_list(output_size, 2, 'output_size') + elif isinstance(output_size, Variable): + check_dtype( + output_size.dtype, + 'output_size', + ['int32', 'int64'], + 'conv2d_transpose', + ) + if len(output_size.shape) == 1 and ( + output_size.shape[0] == 1 or output_size.shape[0] == 2 + ): + if output_size.shape[0] == 1: + output_size = [output_size, output_size] + else: + raise ValueError("output_size must contain one or two integers.") + else: + raise ValueError( + "output_size should be int, list[int] or tuple[int] or Tensor" + ) + + if filter_size is None: + if output_size is []: + raise ValueError("output_size must be set when filter_size is None") + if not _non_static_mode(): + if isinstance(output_size, Variable) or utils._contain_var( + output_size + ): + raise ValueError( + "filter_size should not be None when output_size is Variable or contain Variable in static mode." 
+ ) + else: + output_size = utils.convert_shape_to_list(output_size) + if len(output_size) == 1: + output_size = utils.convert_to_list( + output_size[0], 2, 'output_size' + ) + + h_in = input.shape[2] if data_format == 'NCHW' else input.shape[1] + w_in = input.shape[3] if data_format == 'NCHW' else input.shape[2] + + filter_size_h = ( + output_size[0] + - (h_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_w = ( + output_size[1] + - (w_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 + filter_size = [filter_size_h, filter_size_w] + else: + filter_size = utils.convert_to_list( + filter_size, 2, 'conv2d_transpose.filter_size' + ) + + if len(padding) == 4 and utils._is_symmetric_padding(padding, 2): + padding = [padding[0], padding[2]] + + if groups is None: + groups = 1 + elif groups <= 0: + raise ValueError( + "the groups of input must be greater than 0, " + "but received the groups of input is {}".format(groups) + ) + + filter_shape = [input_channel, num_filters // groups] + filter_size + + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) + + pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type=op_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) + + if data_format == 'NCHW': + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + else: + pre_act = helper.append_bias_op(pre_bias, dim_start=3, dim_end=4) + out = helper.append_activation(pre_act) + return out + + +def conv3d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCDHW', +): + r""" + :api_attr: Static Graph + + The convolution3D transpose layer calculates the output based on the input, + filter, and dilations, strides, paddings. Input(Input) and output(Output) + are in NCDHW or NDHWC format. Where N is batch size, C is the number of channels, + D is the depth of the feature, H is the height of the feature, and W + is the width of the feature. Parameters(dilations, strides, paddings) are + two elements. These two elements represent height and width, respectively. + The details of convolution transpose layer, please refer to the following + explanation and references `therein `_. + If bias attribution and activation type are provided, bias is added to + the output of the convolution, and the corresponding activation function + is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \ast X + b) + + In the above equation: + + * :math:`X`: Input value, a Tensor with NCDHW or NDHWC format. + * :math:`W`: Filter value, a Tensor with MCDHW format. + * :math:`\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D Tensor with shape [M, 1]. + * :math:`\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. 
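The dispatch near the top of the conv2d_transpose body added above silently selects the depthwise_conv2d_transpose op when the input channel count, groups and num_filters coincide and use_cudnn is False. A minimal sketch of a call that takes that path (shapes are illustrative):

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name='x', shape=[None, 4, 8, 8], dtype='float32')
    # input channels == groups == num_filters and use_cudnn=False, so the layer
    # is lowered to depthwise_conv2d_transpose instead of conv2d_transpose
    y = paddle.static.nn.conv2d_transpose(
        x, num_filters=4, filter_size=3, groups=4, use_cudnn=False)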
+ + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)` + + - Output: + + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D^\prime_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\ + H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\ + W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 \\\\ + D_{out} &\in [ D^\prime_{out}, D^\prime_{out} + strides[0] ] \\\\ + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[1] ] \\\\ + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[2] ] + + Note: + The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, + when stride > 1, conv3d maps multiple input shape to the same output shape, + so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. + If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ + H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output + size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, + the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` + and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, + conv3d_transpose can compute the kernel size automatically. + + Args: + input(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type + of input is float32 or float64. + num_filters(int): The number of the filter. It is as same as the output + image channel. + output_size(int|tuple, optional): The output image size. If output size is a + tuple, it must contain three integers, (image_depth, image_height, image_width). This + parameter only works when filter_size is None. If output_size and filter_size are + specified at the same time, They should follow the formula above. Default: None. + Output_size and filter_size should not be None at the same time. + filter_size(int|tuple, optional): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_depth, filter_size_height, + filter_size_width). Otherwise, filter_size_depth = filter_size_height = \ + filter_size_width = filter_size. None if use output size to + calculate filter_size. Default: None. filter_size and output_size should not be + None at the same time. + padding(int|list|str|tuple, optional): The padding size. The padding argument effectively + adds `dilation * (kernel - 1)` amount of zero-padding on both sides of input. If `padding` is a string, + either 'VALID' or 'SAME' supported, which is the padding algorithm. If `padding` + is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `'NCDHW'`, `padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. + when `data_format` is `'NDHWC'`, `padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Default: padding = 0. + stride(int|tuple, optional): The stride size. 
It means the stride in transposed convolution. + If stride is a tuple, it must contain three integers, (stride_depth, stride_height, + stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. + Default: stride = 1. + dilation(int|tuple, optional): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. + groups(int, optional): The group number of the Conv3d transpose layer. Inspired by + grouped convolution in Alex Krizhevsky's Deep CNN paper, in which + when group=2, the first half of the filters is only connected to the + first half of the input channels, while the second half of the + filters is only connected to the second half of the input channels. + Default: groups=1. + param_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of conv3d_transpose. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d_transpose + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized to zero. Default: None. + use_cudnn(bool, optional): Use cudnn kernel or not; it is valid only when the cudnn + library is installed. Default: True. + act (str, optional): Activation type. If it is set to None, no activation is appended. + Default: None. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually there is no need to set name; it is + None by default. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + + Returns: + A Variable holding a Tensor representing the conv3d_transpose, whose data + type is the same as the input and whose shape is (num_batches, channels, out_d, out_h, + out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, it is the tensor + variable storing the transposed convolution result; if act is not None, it is the tensor + variable storing the transposed convolution and non-linearity activation result. + + Raises: + ValueError: If the type of `use_cudnn` is not bool. + ValueError: If `data_format` is not "NCDHW" or "NDHWC". + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 + or the element corresponding to the input's channel is not 0. + ValueError: If `output_size` and `filter_size` are both None. + ShapeError: If the input is not a 5-D Tensor. + ShapeError: If the input's dimension size and the filter's dimension size are not equal. + ShapeError: If the dimension size of input minus the size of `stride` is not 2. + ShapeError: If the number of input channels is not equal to the filter's channels. 
+ ShapeError: If the size of `output_size` is not equal to that of `stride`. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.enable_static() + data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') + param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) + res = paddle.static.nn.conv3d_transpose(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + x = np.random.rand(1, 3, 12, 32, 32).astype("float32") + output = exe.run(feed={"data": x}, fetch_list=[res]) + print(output) + """ + assert ( + param_attr is not False + ), "param_attr should not be False in conv3d_transpose." + if data_format not in ['NCDHW', 'NDHWC']: + raise ValueError( + "Param(data_format) of Op(paddle.static.nn.conv3d_transpose) got wrong value: received " + + data_format + + " but only NCDHW or NDHWC supported." + ) + + l_type = "conv3d_transpose" + helper = LayerHelper(l_type, **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv3d_transpose must be Variable") + if len(input.shape) != 5: + raise ValueError( + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) + input_channel = ( + input.shape[1] if data_format == 'NCDHW' else input.shape[-1] + ) + + stride = utils.convert_to_list(stride, 3, 'stride') + dilation = utils.convert_to_list(dilation, 3, 'dilation') + + if not isinstance(use_cudnn, bool): + raise ValueError("use_cudnn should be True or False") + + def _update_padding(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 5: + if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding) + ) + padding = padding[2:5] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): + if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding) + ) + padding = padding[1:4] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 6, 'padding') + + elif is_list_or_tuple(padding) and len(padding) == 6: + padding = utils.convert_to_list(padding, 6, 'padding') + + else: + padding = utils.convert_to_list(padding, 3, 'padding') + padding = [ + padding[0], + padding[0], + padding[1], + padding[1], + padding[2], + padding[2], + ] + return padding + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
+ % str(padding) + ) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0, 0, 0, 0, 0] + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0, 0, 0, 0, 0] + + padding = _update_padding(padding, data_format) + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size, output_size] + + d_in = input.shape[2] if data_format == 'NCDHW' else input.shape[1] + h_in = input.shape[3] if data_format == 'NCDHW' else input.shape[2] + w_in = input.shape[4] if data_format == 'NCDHW' else input.shape[3] + + filter_size_d = ( + output_size[0] + - (d_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_h = ( + output_size[1] + - (h_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 + filter_size_w = ( + output_size[2] + - (w_in - 1) * stride[2] + + padding[4] + + padding[5] + - 1 + ) // dilation[2] + 1 + filter_size = [filter_size_d, filter_size_h, filter_size_w] + else: + filter_size = utils.convert_to_list( + filter_size, 3, 'conv3d_transpose.filter_size' + ) + + if len(padding) == 6 and utils._is_symmetric_padding(padding, 3): + padding = [padding[0], padding[2], padding[4]] + + if output_size is None: + output_size = [] + elif isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 3, 'output_size') + else: + raise ValueError("output_size should be int, list[int] or tuple[int]") + + groups = 1 if groups is None else groups + if groups <= 0: + raise ValueError( + "the groups of conv3d_transpose should be greater than 0. Received groups: {}".format( + groups + ) + ) + if num_filters % groups != 0: + raise ValueError( + "Attr(num_filters) must be divisible by groups," + "Received: Attr(num_filters) is {}, the groups is {}".format( + num_filters, groups + ) + ) + + filter_shape = [input_channel, num_filters // groups] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) + + if data_format == 'NCDHW': + data_format = 'NCHW' + if data_format == 'NDHWC': + data_format = 'NHWC' + + pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type=l_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) + + if data_format == 'NCHW': + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + else: + pre_act = helper.append_bias_op(pre_bias, dim_start=4, dim_end=5) + out = helper.append_activation(pre_act) + return out + + def deformable_conv( input, offset, From 5f36e7757d45f3501d7dacacef8e2a38b6d1c4b1 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Wed, 23 Nov 2022 11:51:29 +0800 Subject: [PATCH 165/210] Remove API: dice_loss (#47933) remove dice_loss which is not used in paddle 2.0 --- python/paddle/fluid/layers/nn.py | 48 ------------------- .../fluid/tests/unittests/test_layers.py | 32 ------------- .../tests/unittests/test_nn_dice_loss.py | 41 ---------------- 3 files changed, 121 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4d188228bf7ee..c9308f98c283f 100644 --- 
a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -107,7 +107,6 @@ 'label_smooth', 'roi_pool', 'roi_align', - 'dice_loss', 'image_resize', 'image_resize_short', 'resize_linear', @@ -6270,53 +6269,6 @@ def roi_align( return align_out -def dice_loss(input, label, epsilon=0.00001, name=None): - r""" - - Dice loss for comparing the similarity between the input predictions and the label. - This implementation is for binary classification, where the input is sigmoid - predictions of each pixel, usually used for segmentation task. The dice loss can - be defined as the following equation: - - .. math:: - - dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ - &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ - &= \frac{(union\_area - intersection\_area)}{total\_area} - - - Parameters: - input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_k, D]`, where :math:`N_1` is - the batch_size, :math:`D` is the number of categories. It is usually the output - predictions of sigmoid activation. The data type can be float32 or float64. - label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_k, 1]`. - where :math:`N_1` is the batch_size. The data type can be int32 or int64. - epsilon (float): The epsilon will be added to the numerator and denominator. - If both input and label are empty, it makes sure dice is 1. - Default: 0.00001 - name(str, optional): The default value is None. - Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name` - - Returns: - Tensor, which shape is [1], data type is the same as `input` . - - Example: - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.randn((3,224,224,2)) - label = paddle.randint(high=2, shape=(3,224,224,1)) - predictions = F.softmax(x) - loss = F.dice_loss(input=predictions, label=label) - """ - return paddle.nn.functional.dice_loss( - input, label, epsilon=epsilon, name=name - ) - - def image_resize( input, out_shape=None, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 8b6c2ada2d2d2..cc33db2385afb 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -4141,38 +4141,6 @@ def test_roi_align(self): np.testing.assert_array_equal(static_res, dy_eager_res_value) np.testing.assert_array_equal(static_res, dy_res_value) - def test_dice_loss(self): - num_classes = 4 - eps = 1e-6 - input_np = np.random.rand(2, 3, num_classes).astype('float32') - label_np = np.random.randint(0, num_classes, [2, 3, 1], dtype=np.int64) - - with self.static_graph(): - input_ = layers.data( - name="input", shape=[None, 3, num_classes], dtype="float32" - ) - label_ = layers.data( - name="label", shape=[None, 3, 1], dtype="int64" - ) - output = layers.dice_loss(input_, label_, eps) - static_res = self.get_static_graph_result( - feed={'input': input_np, 'label': label_np}, fetch_list=[output] - )[0] - - with self.dynamic_graph(): - with _test_eager_guard(): - input_ = base.to_variable(input_np) - label_ = base.to_variable(label_np) - dy_eager_res = layers.dice_loss(input_, label_, eps) - dy_eager_res_value = dy_eager_res.numpy() - - input_ = base.to_variable(input_np) - label_ = base.to_variable(label_np) - dy_res = layers.dice_loss(input_, label_, eps) - dy_res_value = dy_res.numpy() - 
np.testing.assert_array_equal(static_res, dy_res_value) - np.testing.assert_array_equal(static_res, dy_eager_res_value) - def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): diff --git a/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py index f1e1f6f6e4b2a..8734bc05e749d 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py +++ b/python/paddle/fluid/tests/unittests/test_nn_dice_loss.py @@ -13,51 +13,10 @@ # limitations under the License. import unittest -import numpy as np -import paddle -import paddle.fluid.layers.nn as nn num_classes = 4 eps = 1e-6 -class TestDiceLossValue(unittest.TestCase): - def test_dice_loss(self): - input_ = paddle.rand([2, 3, num_classes]) - label_ = paddle.randint(0, num_classes, [2, 3, 1], dtype=paddle.int64) - - input_np, label_np = input_.numpy(), label_.numpy() - eye_np = np.eye(num_classes) - label_np = np.float32(eye_np[np.squeeze(label_np)]) - input_np = np.reshape(input_np, [2, -1]) - label_np = np.reshape(label_np, [2, -1]) - intersection_np = np.sum(input_np * label_np, axis=-1) - union_np = input_np.sum(-1) + label_np.sum(-1) - dice_np = np.mean(1 - 2 * intersection_np / (union_np + eps)) - dice_paddle = nn.dice_loss(input_, label_, eps) - np.testing.assert_allclose(dice_np, dice_paddle.numpy(), rtol=1e-05) - - -class TestDiceLossInvalidInput(unittest.TestCase): - def test_error(self): - def test_invalid_dtype(): - input_ = paddle.rand([2, 3, num_classes], dtype=paddle.float32) - label_ = paddle.randint( - 0, num_classes, [2, 3, 1], dtype=paddle.int64 - ) - nn.dice_loss(input_, label_.astype(paddle.float32)) - - self.assertRaises(AssertionError, test_invalid_dtype) - - def test_zero_shape_input(): - input_ = paddle.rand([0, 3, num_classes], dtype=paddle.float32) - label_ = paddle.randint( - 0, num_classes, [0, 3, 1], dtype=paddle.int64 - ) - nn.dice_loss(input_, label_) - - self.assertRaises(AssertionError, test_zero_shape_input) - - if __name__ == "__main__": unittest.main() From b994c89deda98c098d435048585c11d98e9bbc0e Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 23 Nov 2022 12:44:55 +0800 Subject: [PATCH 166/210] =?UTF-8?q?=E3=80=90fluid=20api=20clear=E3=80=91re?= =?UTF-8?q?move=20transpose=20(#47917)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove transpose * codestyle check * modify CI_STATIC * modify CI_STATIC * modify enable static() * remove unused import * fix conflict about stack * fix conflict about stack * fix conflict about stack * endless conflict --- .../paddle/fluid/contrib/layers/rnn_impl.py | 16 +-- python/paddle/fluid/layers/control_flow.py | 24 +++- python/paddle/fluid/layers/detection.py | 6 +- python/paddle/fluid/layers/nn.py | 103 ------------------ python/paddle/fluid/layers/rnn.py | 17 +-- python/paddle/fluid/nets.py | 4 +- .../auto_parallel/test_dist_op_cost.py | 25 ++--- .../fleet/parallel_dygraph_transformer.py | 10 +- .../fluid/tests/unittests/dist_transformer.py | 5 +- .../seq2seq_dygraph_model.py | 18 +-- .../dygraph_to_static/test_ptb_lm.py | 4 +- .../transformer_dygraph_model.py | 21 ++-- .../unittests/dygraph_to_static/yolov3.py | 5 +- .../unittests/ipu/test_transpose_op_ipu.py | 2 +- .../test_mkldnn_cpu_bfloat16_pass.py | 5 +- .../test_mkldnn_matmul_op_output_fuse_pass.py | 13 ++- ...n_reshape_transpose_matmul_v2_fuse_pass.py | 4 +- 
.../inference/test_trt_anchor_generator_op.py | 4 +- .../test_trt_shuffle_channel_detect_pass.py | 5 +- .../ir/inference/test_trt_subgraph_pass.py | 4 +- ..._trt_transpose_flatten_concat_fuse_pass.py | 6 +- .../unittests/mlu/test_transpose_op_mlu.py | 8 +- .../fluid/tests/unittests/test_cholesky_op.py | 2 +- .../test_eager_deletion_padding_rnn.py | 14 +-- .../tests/unittests/test_imperative_deepcf.py | 3 +- ..._imperative_lod_tensor_to_selected_rows.py | 2 +- .../test_imperative_ocr_attention_model.py | 2 +- .../unittests/test_imperative_ptb_rnn.py | 6 +- .../unittests/test_imperative_save_load.py | 6 +- .../unittests/test_imperative_save_load_v2.py | 7 +- ..._imperative_selected_rows_to_lod_tensor.py | 2 +- ..._imperative_transformer_sorted_gradient.py | 9 +- .../tests/unittests/test_rnn_cell_api.py | 7 +- .../tests/unittests/test_static_save_load.py | 6 +- .../tests/unittests/test_transpose_op.py | 10 +- .../tests/unittests/transformer_model.py | 4 +- 36 files changed, 157 insertions(+), 232 deletions(-) diff --git a/python/paddle/fluid/contrib/layers/rnn_impl.py b/python/paddle/fluid/contrib/layers/rnn_impl.py index 78813126e26ac..df6a38852ff8c 100644 --- a/python/paddle/fluid/contrib/layers/rnn_impl.py +++ b/python/paddle/fluid/contrib/layers/rnn_impl.py @@ -326,7 +326,7 @@ def basic_gru( ) if batch_first: - input = layers.transpose(input, [1, 0, 2]) + input = paddle.transpose(input, [1, 0, 2]) mask = None if sequence_length: @@ -334,7 +334,7 @@ def basic_gru( mask = layers.sequence_mask( sequence_length, maxlen=max_seq_len, dtype='float32' ) - mask = layers.transpose(mask, [1, 0]) + mask = paddle.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: @@ -425,7 +425,7 @@ def get_single_direction_output( ) if batch_first: - rnn_out = layers.transpose(rnn_out, [1, 0, 2]) + rnn_out = paddle.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden else: @@ -433,7 +433,7 @@ def get_single_direction_output( last_hidden = fw_last_hidden if batch_first: - rnn_out = layers.transpose(rnn_out, [1, 0, 2]) + rnn_out = paddle.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden @@ -610,7 +610,7 @@ def basic_lstm( ) if batch_first: - input = layers.transpose(input, [1, 0, 2]) + input = paddle.transpose(input, [1, 0, 2]) mask = None if sequence_length: @@ -619,7 +619,7 @@ def basic_lstm( sequence_length, maxlen=max_seq_len, dtype='float32' ) - mask = layers.transpose(mask, [1, 0]) + mask = paddle.transpose(mask, [1, 0]) direc_num = 1 if bidirectional: @@ -740,7 +740,7 @@ def get_single_direction_output( ) if batch_first: - rnn_out = layers.transpose(rnn_out, [1, 0, 2]) + rnn_out = paddle.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell else: @@ -749,7 +749,7 @@ def get_single_direction_output( last_cell = fw_last_cell if batch_first: - rnn_out = layers.transpose(rnn_out, [1, 0, 2]) + rnn_out = paddle.transpose(rnn_out, [1, 0, 2]) return rnn_out, last_hidden, last_cell diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index ee53f23684ca9..7ad02f364dfd9 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -625,10 +625,12 @@ class StaticRNN: Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -637,7 +639,7 @@ class StaticRNN: dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn = fluid.layers.StaticRNN() with rnn.step(): @@ -714,10 +716,12 @@ def memory( Examples 1: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -726,7 +730,7 @@ def memory( dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn = fluid.layers.StaticRNN() with rnn.step(): @@ -742,9 +746,11 @@ def memory( Examples 2: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -753,7 +759,7 @@ def memory( dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) boot_memory = fluid.layers.data(name='boot', shape=[hidden_size], dtype='float32', lod_level=1) rnn = fluid.layers.StaticRNN() with rnn.step(): @@ -842,10 +848,12 @@ def step_input(self, x): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -854,7 +862,7 @@ def step_input(self, x): dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn = fluid.layers.StaticRNN() with rnn.step(): @@ -893,10 +901,12 @@ def step_output(self, o): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -905,7 +915,7 @@ def step_output(self, o): dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn = fluid.layers.StaticRNN() with rnn.step(): @@ -953,10 +963,12 @@ def output(self, *outputs): Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers vocab_size, hidden_size=10000, 200 + paddle.enable_static() x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') # create word sequence x_emb = layers.embedding( @@ -965,7 +977,7 @@ def output(self, *outputs): dtype='float32', is_sparse=False) # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn = fluid.layers.StaticRNN() with rnn.step(): diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index bfa063c105270..d89c6b9c1f046 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -774,7 +774,7 @@ class number, M is number of bounding boxes. code_type='decode_center_size', ) scores = nn.softmax(input=scores) - scores = nn.transpose(scores, perm=[0, 2, 1]) + scores = paddle.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_variable_for_type_inference( dtype=decoded_box.dtype @@ -2443,7 +2443,7 @@ def _is_list_or_tuple_and_equal(data, length, err_info): stride=stride, ) - mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1]) + mbox_loc = paddle.transpose(mbox_loc, perm=[0, 2, 3, 1]) mbox_loc_flatten = nn.flatten(mbox_loc, axis=1) mbox_locs.append(mbox_loc_flatten) @@ -2456,7 +2456,7 @@ def _is_list_or_tuple_and_equal(data, length, err_info): padding=pad, stride=stride, ) - conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1]) + conf_loc = paddle.transpose(conf_loc, perm=[0, 2, 3, 1]) conf_loc_flatten = nn.flatten(conf_loc, axis=1) mbox_confs.append(conf_loc_flatten) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c9308f98c283f..d782bf973a5f1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -89,7 +89,6 @@ 'l2_normalize', 'matmul', 'topk', - 'transpose', 'im2sequence', 'row_conv', 'multiplex', @@ -4875,108 +4874,6 @@ def ctc_greedy_decoder( return ctc_out, ctc_out_len -def transpose(x, perm, name=None): - """ - Permute the data dimensions of `input` according to `perm`. - - The `i`-th dimension of the returned tensor will correspond to the - perm[i]-th dimension of `input`. - - Args: - x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. - perm (list|tuple): Permute the input according to the data of perm. - name (str): The name of this layer. It is optional. - - Returns: - Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. - - For Example: - - .. code-block:: text - - x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] - [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] - shape(x) = [2,3,4] - - # Example 1 - perm0 = [1,0,2] - y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] - [[ 5 6 7 8] [17 18 19 20]] - [[ 9 10 11 12] [21 22 23 24]]] - shape(y_perm0) = [3,2,4] - - # Example 2 - perm1 = [2,1,0] - y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] - [[ 2 14] [ 6 18] [10 22]] - [[ 3 15] [ 7 19] [11 23]] - [[ 4 16] [ 8 20] [12 24]]] - shape(y_perm1) = [4,3,2] - - Examples: - - .. 
code-block:: python - - import paddle - - x = paddle.randn([2, 3, 4]) - x_transposed = paddle.transpose(x, perm=[1, 0, 2]) - print(x_transposed.shape) - # [3L, 2L, 4L] - - """ - if in_dygraph_mode(): - return _C_ops.transpose(x, perm) - else: - if _in_legacy_dygraph(): - out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) - return out - - check_variable_and_dtype( - x, - 'x', - [ - 'bool', - 'float16', - 'float32', - 'float64', - 'int32', - 'int64', - 'complex64', - 'complex128', - ], - 'transpose', - ) - check_type(perm, 'perm', (list, tuple), 'transpose') - if isinstance(perm, tuple): - perm = list(perm) - if len(perm) != len(x.shape): - raise ValueError( - "Input(perm) is the permutation of dimensions of Input(x), " - "its length should be equal to dimensions of Input(x), " - "but received dimension of Input(x) is %s, " - "the length of Input(perm) is %s." % (len(x.shape), len(perm)) - ) - for idx, dim in enumerate(perm): - if dim >= len(x.shape): - raise ValueError( - "Each element in Input(perm) should be less than Input(x)'s dimension, " - "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " - "dimension %d." % (idx, perm[idx], len(x.shape)) - ) - - helper = LayerHelper('transpose', **locals()) - out = helper.create_variable_for_type_inference(x.dtype) - x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op( - type='transpose2', - inputs={'X': [x]}, - outputs={'Out': [out], 'XShape': [x_shape]}, - attrs={'axis': perm}, - ) - return out - - def im2sequence( input, filter_size=1, diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 9b384203fa885..f7ce8d1e6ca18 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -16,6 +16,7 @@ from functools import partial, reduce import warnings + import paddle from paddle.utils import deprecated from . import nn @@ -563,7 +564,7 @@ def _maybe_copy(state, new_state, step_mask): def _transpose_batch_time(x): perm = [1, 0] + list(range(2, len(x.shape))) - return nn.transpose(x, perm) + return paddle.transpose(x, perm) def _rnn_dynamic_graph( @@ -591,7 +592,7 @@ def _rnn_dynamic_graph( mask = sequence_lod.sequence_mask( sequence_length, maxlen=time_steps, dtype=inputs.dtype ) - mask = nn.transpose(mask, [1, 0]) + mask = paddle.transpose(mask, [1, 0]) if is_reverse: inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs) @@ -678,7 +679,7 @@ def _switch_grad(x, stop=False): maxlen=max_seq_len, dtype=flatten(initial_states)[0].dtype, ) - mask = nn.transpose(mask, [1, 0]) + mask = paddle.transpose(mask, [1, 0]) if is_reverse: inputs = map_structure(lambda x: tensor.reverse(x, axis=[0]), inputs) mask = tensor.reverse(mask, axis=[0]) if sequence_length else None @@ -1032,14 +1033,14 @@ def tile_beam_merge_with_batch(x, beam_size): expand_times = [1] * len(x.shape) expand_times[1] = beam_size x = paddle.tile(x, expand_times) # [batch_size, beam_size, ...] - x = nn.transpose( + x = paddle.transpose( x, list(range(2, len(x.shape))) + [0, 1] ) # [..., batch_size, beam_size] # use 0 to copy to avoid wrong shape x = paddle.reshape( x, shape=[0] * (len(x.shape) - 2) + [-1] ) # [..., batch_size * beam_size] - x = nn.transpose( + x = paddle.transpose( x, [len(x.shape) - 1] + list(range(0, len(x.shape) - 1)) ) # [batch_size * beam_size, ...] 
return x @@ -1557,7 +1558,9 @@ def _maybe_copy(state, new_state, step_mask): if not output_time_major: final_outputs = map_structure( - lambda x: nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))), + lambda x: paddle.transpose( + x, [1, 0] + list(range(2, len(x.shape))) + ), final_outputs, ) @@ -1629,7 +1632,7 @@ def _maybe_copy(state, new_state, step_mask): return new_state def _transpose_batch_time(x): - return nn.transpose(x, [1, 0] + list(range(2, len(x.shape)))) + return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _create_array_out_of_while(dtype): current_block_idx = default_main_program().current_block_idx diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index eab247452fc92..c4bd6cf81f5f4 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -577,7 +577,7 @@ def __split_heads(x, num_heads): # permute the dimensions into: # [batch_size, num_heads, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ @@ -598,7 +598,7 @@ def __combine_heads(x): if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + trans_x = paddle.transpose(x, perm=[0, 2, 1, 3]) return paddle.reshape( x=trans_x, shape=list( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py index 517debaa58842..92ce41aa608cb 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py @@ -16,6 +16,7 @@ import copy import paddle + from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.operators.common import ( @@ -151,9 +152,7 @@ def make_program(): auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None], ) - out = paddle.fluid.layers.transpose( - out, [1, 0] - ) # [8, 2] [-1, 0] + out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] # matmul param1 = paddle.fluid.layers.create_parameter( @@ -188,9 +187,7 @@ def make_program(): tmp_out, param2 ) # [8, 4] [-1, 0] - out8 = paddle.fluid.layers.transpose( - out2, [1, 0] - ) # [4, 8] [0, -1] + out8 = paddle.transpose(out2, [1, 0]) # [4, 8] [0, -1] # reshape out9 = paddle.reshape(out8, [8, 2, 4]) # [4, 2, 4] [0, -1, -1] @@ -266,9 +263,7 @@ def make_program(): auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None], ) - out = paddle.fluid.layers.transpose( - out, [1, 0] - ) # [8, 2] [-1, 0] + out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] # matmul_v2 param1 = paddle.fluid.layers.create_parameter( @@ -300,9 +295,7 @@ def make_program(): tmp_out = paddle.matmul(out1, tmp_param) out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0] - out8 = paddle.fluid.layers.transpose( - out2, [1, 0] - ) # [4, 8] [0, -1] + out8 = paddle.transpose(out2, [1, 0]) # [4, 8] [0, -1] # reshape out9 = paddle.reshape(out8, [8, 2, 4]) # [4, 2, 4] [0, -1, -1] @@ -377,9 +370,7 @@ def make_program(): auto.ProcessMesh([0, 1], dim_names=["x"]), ["x", None], ) - out = paddle.fluid.layers.transpose( - out, [1, 0] - ) # [8, 2] [-1, 0] + out = paddle.transpose(out, [1, 0]) # [8, 2] [-1, 0] # mul param1 = paddle.fluid.layers.create_parameter( @@ -414,9 +405,7 @@ def make_program(): tmp_out, param2 ) # [8, 4] [-1, 0] - out8 = paddle.fluid.layers.transpose( - out2, [1, 0] - 
) # [4, 8] [0, -1] + out8 = paddle.transpose(out2, [1, 0]) # [4, 8] [0, -1] # reshape out9 = paddle.reshape(out8, [8, 2, 4]) # [4, 2, 4] [0, -1, -1] diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 21d357fcdd180..d3fd734d6aa76 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -328,15 +328,16 @@ def forward(self, queries, keys, values, attn_bias): reshaped_q = paddle.reshape( x=q, shape=[0, 0, self._n_head, self._d_key] ) - transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + + transpose_q = paddle.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) reshaped_k = paddle.reshape( x=k, shape=[0, 0, self._n_head, self._d_key] ) - transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) + transpose_k = paddle.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) reshaped_v = paddle.reshape( x=v, shape=[0, 0, self._n_head, self._d_value] ) - transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) + transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) # scale dot product attention product = fluid.layers.matmul( @@ -362,7 +363,8 @@ def forward(self, queries, keys, values, attn_bias): # combine heads if len(out.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) + + trans_x = paddle.transpose(out, perm=[0, 2, 1, 3]) final_out = paddle.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index fbe292e1f368c..24de04dc6fb56 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -23,6 +23,7 @@ import random import tarfile +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP @@ -1148,7 +1149,7 @@ def __split_heads(x, n_head): # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ @@ -1160,7 +1161,7 @@ def __combine_heads(x): if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + trans_x = paddle.transpose(x, perm=[0, 2, 1, 3]) # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. 
return paddle.reshape( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 539400ad927ad..4b52df98c22d5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -176,7 +176,7 @@ def __init__( ) def _transpose_batch_time(self, x): - return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) + return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): return paddle.reshape(x, shape=(-1, x.shape[2])) @@ -234,7 +234,7 @@ def forward(self, inputs): enc_len_mask = fluid.layers.sequence_mask( src_sequence_length, maxlen=max_seq_len, dtype="float32" ) - enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) + enc_len_mask = paddle.transpose(enc_len_mask, [1, 0]) # TODO: Because diff exits if call while_loop in static graph. # In while block, a Variable created in parent block participates in the calculation of gradient, @@ -336,7 +336,7 @@ def beam_search(self, inputs): enc_len_mask = fluid.layers.sequence_mask( src_sequence_length, maxlen=max_seq_len, dtype="float32" ) - enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) + enc_len_mask = paddle.transpose(enc_len_mask, [1, 0]) for k in range(args.max_seq_len): enc_step_input = src_emb[k] @@ -643,7 +643,7 @@ def __init__( ) def _transpose_batch_time(self, x): - return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape)))) + return paddle.transpose(x, [1, 0] + list(range(2, len(x.shape)))) def _merge_batch_beams(self, x): return paddle.reshape(x, shape=(-1, x.shape[2])) @@ -653,14 +653,14 @@ def tile_beam_merge_with_batch(self, x): expand_times = [1] * len(x.shape) expand_times[1] = self.beam_size x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...] - x = fluid.layers.transpose( + x = paddle.transpose( x, list(range(2, len(x.shape))) + [0, 1] ) # [..., batch_size, beam_size] # use 0 to copy to avoid wrong shape x = paddle.reshape( x, shape=[0] * (len(x.shape) - 2) + [-1] ) # [..., batch_size * beam_size] - x = fluid.layers.transpose( + x = paddle.transpose( x, [len(x.shape) - 1] + list(range(0, len(x.shape) - 1)) ) # [batch_size * beam_size, ...] return x @@ -691,9 +691,9 @@ def attention(self, query, enc_output, mask=None): attn = fluid.layers.matmul(query, memory, transpose_y=True) if mask is not None: - attn = fluid.layers.transpose(attn, [1, 0, 2]) + attn = paddle.transpose(attn, [1, 0, 2]) attn = fluid.layers.elementwise_add(attn, mask * 1000000000, -1) - attn = fluid.layers.transpose(attn, [1, 0, 2]) + attn = paddle.transpose(attn, [1, 0, 2]) weight = fluid.layers.softmax(attn) weight_memory = fluid.layers.matmul(weight, memory) @@ -743,7 +743,7 @@ def forward(self, inputs): src_sequence_length, maxlen=max_seq_len, dtype="float32" ) enc_padding_mask = enc_len_mask - 1.0 - enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0]) + enc_len_mask = paddle.transpose(enc_len_mask, [1, 0]) enc_outputs = [] # TODO: Because diff exits if call while_loop in static graph. 
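The call-site changes in this commit are mechanical: every `layers.transpose` / `fluid.layers.transpose` call becomes `paddle.transpose` with the same `perm` argument. A minimal sketch of that equivalence follows; the tensor shape is an illustrative assumption and is not taken from the patch.

import paddle

x = paddle.randn([2, 3, 4])                # illustrative input, not from the patch
# before this patch: out = fluid.layers.transpose(x, perm=[1, 0, 2])
out = paddle.transpose(x, perm=[1, 0, 2])  # drop-in replacement applied throughout
print(out.shape)                           # [3, 2, 4]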
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index e87c727f7d716..10e8f2cf68e85 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -122,12 +122,12 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) return real_res, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index e26699bacfb52..796329ab555d6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -138,12 +138,13 @@ def forward(self, queries, keys, values, attn_bias, cache=None): k = self.k_fc(keys) v = self.v_fc(values) # split head + q = paddle.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) - q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) k = paddle.reshape(x=k, shape=[0, 0, self.n_head, self.d_key]) - k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + k = paddle.transpose(x=k, perm=[0, 2, 1, 3]) v = paddle.reshape(x=v, shape=[0, 0, self.n_head, self.d_value]) - v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + v = paddle.transpose(x=v, perm=[0, 2, 1, 3]) if cache is not None: cache_k, cache_v = cache["k"], cache["v"] @@ -160,8 +161,10 @@ def forward(self, queries, keys, values, attn_bias, cache=None): if self.dropout_rate: weights = layers.dropout(weights, dropout_prob=self.dropout_rate) out = layers.matmul(weights, v) - out = layers.transpose(out, perm=[0, 2, 1, 3]) + + out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + out = self.proj_fc(out) return out @@ -703,7 +706,7 @@ def expand_to_beam_size(tensor, beam_size): def merge_batch_beams(tensor): var_dim_in_state = 2 # count in beam dim - tensor = layers.transpose( + tensor = paddle.transpose( tensor, list(range(var_dim_in_state, len(tensor.shape))) + list(range(0, var_dim_in_state)), @@ -714,7 +717,7 @@ def merge_batch_beams(tensor): [0] * (len(tensor.shape) - var_dim_in_state) + [batch_size * beam_size], ) - res = layers.transpose( + res = paddle.transpose( tensor, list( range( @@ -728,7 +731,7 @@ def merge_batch_beams(tensor): def split_batch_beams(tensor): var_dim_in_state = 1 - tensor = layers.transpose( + tensor = paddle.transpose( tensor, list(range(var_dim_in_state, len(tensor.shape))) + list(range(0, var_dim_in_state)), @@ -738,7 +741,7 @@ def split_batch_beams(tensor): [0] * (len(tensor.shape) - var_dim_in_state) + [batch_size, beam_size], ) - res = layers.transpose( + res = paddle.transpose( tensor, list( range( @@ -878,7 +881,7 @@ def gather(input, indices, batch_pos): predict_ids = paddle.stack(predict_ids, axis=0) parent_ids = paddle.stack(parent_ids, axis=0) - 
finished_seq = layers.transpose( + finished_seq = paddle.transpose( layers.gather_tree(predict_ids, parent_ids), [1, 2, 0] ) finished_scores = topk_scores diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 28078aba7893c..58dae8bcfeb59 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -16,6 +16,7 @@ import sys import paddle + import paddle.fluid as fluid from paddle.fluid.dygraph import declarative from paddle.fluid.param_attr import ParamAttr @@ -345,9 +346,7 @@ def forward( name="yolo_box" + str(i), ) self.boxes.append(boxes) - self.scores.append( - fluid.layers.transpose(scores, perm=[0, 2, 1]) - ) + self.scores.append(paddle.transpose(scores, perm=[0, 2, 1])) self.downsample //= 2 if not self.is_train: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 34e0457c2ded0..e683f82521ad2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -46,7 +46,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) - out = paddle.fluid.layers.transpose(x, **self.attrs) + out = paddle.transpose(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py index 5792db6af95c0..0a74492ec6f73 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_cpu_bfloat16_pass.py @@ -15,6 +15,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid from paddle.fluid.core import PassVersionChecker import paddle @@ -27,8 +28,10 @@ def setUp(self): x = fluid.data( name='x', shape=[-1] + self.shape_x, dtype=self.d_type ) - out = fluid.layers.transpose(x, perm=[0, 1, 2, 3]) + + out = paddle.transpose(x, perm=[0, 1, 2, 3]) out = paddle.reshape(out, [0, 0, 0, 0]) + out = fluid.layers.fc(out, size=1) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py index a320dfbe4dcff..79546196660f4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_op_output_fuse_pass.py @@ -14,6 +14,7 @@ import unittest import numpy as np + import paddle import paddle.fluid as fluid from inference_pass_test import InferencePassTest @@ -36,8 +37,9 @@ def make_network(self): name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) out = fluid.layers.matmul(x, y) - out = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.transpose(out, perm=[0, 2, 1, 3]) out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) + out = fluid.layers.relu(out) return out @@ -77,7 +79,7 @@ def make_network(self): name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) out = fluid.layers.matmul(x, y) - out = fluid.layers.transpose(out, perm=[0, 1, 2, 
3]) + out = paddle.transpose(out, perm=[0, 1, 2, 3]) out = paddle.reshape(out, [0, 0, 0, 0]) out = fluid.layers.fc(out, size=1) return out @@ -100,11 +102,10 @@ def make_network(self): name='y', shape=[-1] + self.shape_y, dtype=self.d_type ) out = fluid.layers.matmul(x, y) - out = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) - out = fluid.layers.transpose( - out, perm=[0, 1, 2, 3] - ) # breaks pattern + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.transpose(out, perm=[0, 1, 2, 3]) # breaks pattern out = paddle.reshape(out, [0, 0, self.shape_y[0] * self.shape_y[2]]) + out = fluid.layers.relu(out) return out diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py index 188d111c45528..b5a345b2cfc56 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_reshape_transpose_matmul_v2_fuse_pass.py @@ -33,8 +33,10 @@ def setUp(self): weight = fluid.layers.create_parameter( shape=self.weight_shape, dtype="float32" ) + reshape = paddle.reshape(data, shape=self.reshape_shape) - transpose = fluid.layers.transpose(reshape, self.tranpose_perm) + transpose = paddle.transpose(reshape, self.tranpose_perm) + matmul = paddle.matmul( transpose, weight, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py index cd05c8528bc1b..2030282f0c24c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest + +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -64,7 +66,7 @@ def build(self): stride=self.stride, ) if self.dynamic_shape_params is not None: - anchor = fluid.layers.transpose(anchor, [2, 3, 0, 1]) + anchor = paddle.transpose(anchor, [2, 3, 0, 1]) out = fluid.layers.batch_norm(anchor, is_test=True) self.fetch_list = [out, var] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py index 754149f7b3489..51f4af19c611c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py @@ -15,6 +15,8 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest + +import paddle import paddle.fluid as fluid from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig @@ -28,8 +30,9 @@ def setUp(self): name="data", shape=[-1, 6, 64, 64], dtype="float32" ) reshape1 = paddle.reshape(x=data, shape=[-1, 2, 3, 64, 64]) - trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) + trans = paddle.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) reshape2 = paddle.reshape(x=trans, shape=[-1, 6, 64, 64]) + out = fluid.layers.batch_norm(reshape2, is_test=True) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index b91b068adb828..86a995c45c01c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -17,6 +17,8 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest + +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -226,7 +228,7 @@ def setUp(self): self.fetch_list = [out] def append_transpose(self, data): - return fluid.layers.transpose(data, [0, 3, 1, 2]) + return paddle.transpose(data, [0, 3, 1, 2]) def test_check_output(self): if core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py index 8fc8b464dda1e..9fc54820d3322 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py @@ -15,6 +15,8 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest + +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig @@ -30,8 +32,8 @@ def setUp(self): data2 = fluid.data( name="data2", shape=[8, 32, 128], dtype="float32" ) - trans1 = fluid.layers.transpose(data1, perm=[0, 2, 1]) - trans2 = fluid.layers.transpose(data2, perm=[0, 2, 1]) + trans1 = paddle.transpose(data1, perm=[0, 2, 1]) + trans2 = paddle.transpose(data2, perm=[0, 2, 1]) flatt1 = fluid.layers.flatten(trans1) flatt2 = fluid.layers.flatten(trans2) concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py index 30e4519887956..a802f9da215b4 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py @@ -192,26 +192,26 @@ def test_errors(self): def test_x_Variable_check(): # the Input(x)'s type must be Variable - fluid.layers.transpose("not_variable", perm=[1, 0, 2]) + paddle.transpose("not_variable", perm=[1, 0, 2]) self.assertRaises(TypeError, test_x_Variable_check) def test_perm_list_check(): # Input(perm)'s type must be list - fluid.layers.transpose(x, perm="[1, 0, 2]") + paddle.transpose(x, perm="[1, 0, 2]") self.assertRaises(TypeError, test_perm_list_check) def test_perm_length_and_x_dim_check(): # Input(perm) is the permutation of dimensions of Input(input) # its length should be equal to dimensions of Input(input) - fluid.layers.transpose(x, perm=[1, 0, 2, 3, 4]) + paddle.transpose(x, perm=[1, 0, 2, 3, 4]) self.assertRaises(ValueError, test_perm_length_and_x_dim_check) def test_each_elem_value_check(): # Each element in Input(perm) should be less than Input(x)'s dimension - fluid.layers.transpose(x, perm=[3, 5, 7]) + paddle.transpose(x, perm=[3, 5, 7]) self.assertRaises(ValueError, test_each_elem_value_check) diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index 1f5d60843139a..0688a782f7287 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -74,7 +74,7 @@ def func(self, place): root = layers.create_parameter( dtype=root_data.dtype, shape=root_data.shape ) - root_t = layers.transpose(root, self.trans_dims) + root_t = paddle.transpose(root, self.trans_dims) x = layers.matmul(x=root, y=root_t) + 1e-05 out = paddle.cholesky(x, upper=self.attrs["upper"]) grad_check(root, out, x_init=root_data, place=place) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index 220a6d13b81fa..6539c6370fa68 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -154,7 +154,7 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): hidden_array.append(pre_hidden) cell_array.append(pre_cell) - input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) + input_embedding = paddle.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): @@ -230,7 +230,7 @@ def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): c, axes=[0], starts=[num_steps - 1], ends=[num_steps] ) last_cell_array.append(last_c) - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) @@ -317,17 +317,17 @@ def encoder_static( last_hidden = paddle.reshape( last_hidden, shape=[-1, num_layers, hidden_size] ) - last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, num_layers, hidden_size] ) - last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = paddle.reshape(real_res, shape=[len, -1, hidden_size]) - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell @@ -404,7 +404,7 @@ def encoder_static( init_cell=init_cell_reshape, ) elif rnn_model == "cudnn": - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + x_emb = paddle.transpose(x_emb, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden_reshape, @@ -417,7 +417,7 @@ def encoder_static( low=-init_scale, high=init_scale ), ) - rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) + rnn_out = paddle.transpose(rnn_out, perm=[1, 0, 2]) elif rnn_model == "basic_lstm": rnn_out, last_hidden, last_cell = basic_lstm( x_emb, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 2b0291b601e75..8f9c24223aa17 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -119,9 +119,10 @@ def __init__(self, num_users, num_items, matrix): def forward(self, users, items): # users_emb = self._user_emb(users) # items_emb = self._item_emb(items) + users_emb = paddle.gather(self._rating_matrix, users) items_emb = paddle.gather( - fluid.layers.transpose(self._rating_matrix, [1, 0]), items + paddle.transpose(self._rating_matrix, [1, 0]), items ) users_emb.stop_gradient = True items_emb.stop_gradient = True 
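Several of the RNN-related diffs above use the same batch-major/time-major swap: transpose the first two axes and keep the remaining axes in order. A small sketch of that pattern, mirroring the `_transpose_batch_time` helpers touched by this commit; the shape below is an assumption for illustration only.

import paddle

def transpose_batch_time(x):
    # swap axes 0 and 1, keep the remaining axes in their original order
    perm = [1, 0] + list(range(2, len(x.shape)))
    return paddle.transpose(x, perm)

x = paddle.randn([8, 20, 16])              # [batch, time, hidden], illustrative only
print(transpose_batch_time(x).shape)       # [20, 8, 16]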
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index 73f8973ebaf1b..ee4060b8b0570 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -63,7 +63,7 @@ def __init__( def forward(self, input, label): x_emb = self.embedding(input) projection = fluid.layers.matmul( - x_emb, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]) + x_emb, paddle.transpose(self.embedding.weight, perm=[1, 0]) ) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 46b568dec44bd..0eb20a7dca158 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -264,7 +264,7 @@ def forward(self, inputs): # stride=[1, 1], # filter_size=[conv_features.shape[2], 1]) - transpose_conv_features = fluid.layers.transpose( + transpose_conv_features = paddle.transpose( conv_features, perm=[0, 3, 1, 2] ) sliced_feature = paddle.reshape( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index a3e603b5a9618..183aafd824613 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -131,17 +131,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) - real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) return real_res, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 4968a2fe28adc..31ad8fc3faef2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -126,17 +126,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) - real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = 
paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) return real_res, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index a450d7e871f55..9bf869c25e0b5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -14,6 +14,7 @@ import os import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.nn import Embedding @@ -128,17 +129,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) - real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) return real_res, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index f137de9dc2cb2..fbf8f24398788 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -73,7 +73,7 @@ def forward(self, input, label): fc = fluid.layers.matmul(x_emb, self.softmax_weight) fc = fluid.layers.elementwise_add(fc, self.softmax_bias) projection = fluid.layers.matmul( - fc, fluid.layers.transpose(self.embedding.weight, perm=[1, 0]) + fc, paddle.transpose(self.embedding.weight, perm=[1, 0]) ) projection = paddle.reshape(projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 5c6f224a5ee19..700ae9a9c878a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -479,15 +479,16 @@ def forward(self, queries, keys, values, attn_bias): reshaped_q = paddle.reshape( x=q, shape=[0, 0, self._n_head, self._d_key] ) - transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) + + transpose_q = paddle.transpose(x=reshaped_q, perm=[0, 2, 1, 3]) reshaped_k = paddle.reshape( x=k, shape=[0, 0, self._n_head, self._d_key] ) - transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) + transpose_k = paddle.transpose(x=reshaped_k, perm=[0, 2, 1, 3]) reshaped_v = paddle.reshape( x=v, 
shape=[0, 0, self._n_head, self._d_value] ) - transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) + transpose_v = paddle.transpose(x=reshaped_v, perm=[0, 2, 1, 3]) # scale dot product attention product = fluid.layers.matmul( @@ -513,7 +514,7 @@ def forward(self, queries, keys, values, attn_bias): # combine heads if len(out.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3]) + trans_x = paddle.transpose(out, perm=[0, 2, 1, 3]) final_out = paddle.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], diff --git a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py index 1d04b4310539f..638c3bbe0025b 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_cell_api.py @@ -15,6 +15,7 @@ import unittest import numpy +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -335,7 +336,7 @@ def test_errors(self): name="sequence_length", shape=[None], dtype='int64' ) - inputs_dynamic_rnn = layers.transpose( + inputs_dynamic_rnn = paddle.transpose( inputs_basic_lstm, perm=[1, 0, 2] ) cell = LSTMCell(hidden_size, name="LSTMCell_for_rnn") @@ -428,7 +429,7 @@ def test_run(self): name="sequence_length", shape=[None], dtype='int64' ) - inputs_dynamic_rnn = layers.transpose(inputs_basic_lstm, perm=[1, 0, 2]) + inputs_dynamic_rnn = paddle.transpose(inputs_basic_lstm, perm=[1, 0, 2]) cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn") output, final_state = dynamic_rnn( cell=cell, @@ -436,7 +437,7 @@ def test_run(self): sequence_length=sequence_length, is_reverse=False, ) - output_new = layers.transpose(output, perm=[1, 0, 2]) + output_new = paddle.transpose(output, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = basic_lstm( inputs_basic_lstm, diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index 8c4c8aa60de0a..720575d4457d5 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -138,17 +138,17 @@ def forward(self, input_embedding, init_hidden=None, init_cell=None): paddle.reshape(self._input, shape=[1, -1, self._hidden_size]) ) real_res = fluid.layers.concat(res, 0) - real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2]) + real_res = paddle.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = fluid.layers.concat(self.hidden_array, 1) last_hidden = paddle.reshape( last_hidden, shape=[-1, self._num_layers, self._hidden_size] ) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = fluid.layers.concat(self.cell_array, 1) last_cell = paddle.reshape( last_cell, shape=[-1, self._num_layers, self._hidden_size] ) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) + last_cell = paddle.transpose(x=last_cell, perm=[1, 0, 2]) return real_res, last_hidden, last_cell diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 67c2b8772c5dd..5a310f6bf8943 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -291,7 +291,7 @@ def test_errors(self): def test_x_Variable_check(): # the 
Input(x)'s type must be Variable - fluid.layers.transpose("not_variable", perm=[1, 0, 2]) + paddle.transpose("not_variable", perm=[1, 0, 2]) self.assertRaises(TypeError, test_x_Variable_check) @@ -300,26 +300,26 @@ def test_x_dtype_check(): x1 = fluid.layers.data( name='x1', shape=[10, 5, 3], dtype='int8' ) - fluid.layers.transpose(x1, perm=[1, 0, 2]) + paddle.transpose(x1, perm=[1, 0, 2]) self.assertRaises(TypeError, test_x_dtype_check) def test_perm_list_check(): # Input(perm)'s type must be list - fluid.layers.transpose(x, perm="[1, 0, 2]") + paddle.transpose(x, perm="[1, 0, 2]") self.assertRaises(TypeError, test_perm_list_check) def test_perm_length_and_x_dim_check(): # Input(perm) is the permutation of dimensions of Input(input) # its length should be equal to dimensions of Input(input) - fluid.layers.transpose(x, perm=[1, 0, 2, 3, 4]) + paddle.transpose(x, perm=[1, 0, 2, 3, 4]) self.assertRaises(ValueError, test_perm_length_and_x_dim_check) def test_each_elem_value_check(): # Each element in Input(perm) should be less than Input(x)'s dimension - fluid.layers.transpose(x, perm=[3, 5, 7]) + paddle.transpose(x, perm=[3, 5, 7]) self.assertRaises(ValueError, test_each_elem_value_check) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index 842d9320dafc7..cf564e771e26f 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -121,7 +121,7 @@ def __split_heads(x, n_head): # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + return paddle.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ @@ -133,7 +133,7 @@ def __combine_heads(x): if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + trans_x = paddle.transpose(x, perm=[0, 2, 1, 3]) # FIXME(guosheng): Decouple the program desc with batch_size. 
         return paddle.reshape(
             x=trans_x,

From edf469194cc4ca58b827598ab678b1936d90eb55 Mon Sep 17 00:00:00 2001
From: duanyanhui <45005871+YanhuiDua@users.noreply.github.com>
Date: Wed, 23 Nov 2022 13:42:06 +0800
Subject: [PATCH 167/210] add support of controlflow op for custom device (#48259)

---
 paddle/fluid/operators/controlflow/conditional_block_op.h | 7 +++++++
 paddle/fluid/operators/controlflow/while_op_helper.cc     | 5 +++--
 paddle/fluid/operators/select_op_helper.h                 | 3 ++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
index bb2a79f96ee91..e0dad2f9cc09e 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
@@ -89,6 +89,13 @@ class ConditionalOp : public framework::OperatorBase {
       framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
       platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
       res = cpu_tensor.data<bool>()[0];
+#endif
+    } else if (platform::is_custom_place(ips[0]->place())) {
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+      phi::DenseTensor cpu_tensor;
+      framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
+      platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
+      res = cpu_tensor.data<bool>()[0];
 #endif
     } else {
       res = ips[0]->data<bool>()[0];
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 8966f21d296bd..adbfed14226f8 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -228,8 +228,9 @@ bool GetCondData(const phi::DenseTensor &cond) {
   // platform::is_npu_place(cond.place()) or
   // platform::is_xpu_place(cond.place()) is true
   std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()};
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||      \
+    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
   framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h
index ffab83e4e74fa..0ef7ed44291df 100644
--- a/paddle/fluid/operators/select_op_helper.h
+++ b/paddle/fluid/operators/select_op_helper.h
@@ -39,7 +39,8 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) {
   }
   // when platform::is_gpu_place(mask.place()) is true
   std::unique_ptr<phi::DenseTensor> cpu_mask{new phi::DenseTensor()};
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE)
   framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get());
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(

From 29d75c14f1e25ca9c4b741270859027fa390179a Mon Sep 17 00:00:00 2001
From: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Date: Wed, 23 Nov 2022 14:00:32 +0800
Subject: [PATCH 168/210] Add bfloat16 type support for abs op (#48205)

* first commit

* 2nd commit
---
 paddle/phi/kernels/funcs/complex_functors.h   | 69 ++++++-------
 paddle/phi/kernels/gpu/abs_grad_kernel.cu     |  1 +
 paddle/phi/kernels/gpu/abs_kernel.cu          | 16 ++++-
 .../phi/kernels/impl/abs_grad_kernel_impl.h   | 68 +++++++++++++++++-
.../tests/unittests/test_activation_op.py | 1 + 5 files changed, 105 insertions(+), 50 deletions(-) diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 8b292cb5dc52e..e6ffeb3b5602e 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -110,53 +110,6 @@ struct AbsFunctor>> { int64_t numel_; }; -template -struct AbsGradCUDAFunctor { - HOSTDEVICE inline AbsGradCUDAFunctor() {} - - HOSTDEVICE inline T operator()(const T x, const T dout) const { - T output; - if (x == T(0)) { - output = T(0); - } else { - output = T(dout) * (x / T(std::abs(x))); - } - return output; - } -}; - -template <> -struct AbsGradCUDAFunctor> { - HOSTDEVICE inline AbsGradCUDAFunctor() {} - HOSTDEVICE inline phi::dtype::complex operator()( - const phi::dtype::complex x, const float dout) const { - phi::dtype::complex output; - if (x == phi::dtype::complex(0)) { - output = phi::dtype::complex(0); - } else { - output = phi::dtype::complex(dout) * - (x / phi::dtype::complex(abs(x))); - } - return output; - } -}; - -template <> -struct AbsGradCUDAFunctor> { - HOSTDEVICE inline AbsGradCUDAFunctor() {} - HOSTDEVICE inline phi::dtype::complex operator()( - const phi::dtype::complex x, const double dout) const { - phi::dtype::complex output; - if (x == phi::dtype::complex(0)) { - output = phi::dtype::complex(0); - } else { - output = phi::dtype::complex(dout) * - (x / phi::dtype::complex(abs(x))); - } - return output; - } -}; - template struct AbsGradFunctor { AbsGradFunctor(const dtype::Real* dout, @@ -179,6 +132,28 @@ struct AbsGradFunctor { int64_t numel_; }; +template <> +struct AbsGradFunctor { + AbsGradFunctor(const dtype::Real* dout, + const phi::dtype::bfloat16* x, + phi::dtype::bfloat16* output, + int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + if (x_[idx] == static_cast(0)) { + output_[idx] = static_cast(0); + } else { + output_[idx] = dout_[idx] * (x_[idx] / (abs(x_[idx]))); + } + } + + const dtype::Real* dout_; + const phi::dtype::bfloat16* x_; + phi::dtype::bfloat16* output_; + int64_t numel_; +}; + template <> struct AbsGradFunctor> { AbsGradFunctor(const float* dout, diff --git a/paddle/phi/kernels/gpu/abs_grad_kernel.cu b/paddle/phi/kernels/gpu/abs_grad_kernel.cu index 8edb6b71224d6..a1afa8569b2fa 100644 --- a/paddle/phi/kernels/gpu/abs_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_grad_kernel.cu @@ -31,6 +31,7 @@ PD_REGISTER_KERNEL(abs_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex, complex) { kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index d025f4b61e763..9f27c986166f4 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -16,8 +16,8 @@ #include #include - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" @@ -36,7 +36,18 @@ struct CudaAbsFunctor>> { }; template -struct CudaAbsFunctor>> { +struct CudaAbsFunctor< + T, + std::enable_if_t>::value && + std::is_same::value>> { + __device__ __forceinline__ T operator()(const T x) const { return abs(x); } +}; + +template +struct CudaAbsFunctor< + T, + std::enable_if_t>::value && + !std::is_same::value>> { __device__ 
__forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -63,5 +74,6 @@ PD_REGISTER_KERNEL(abs, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 9dad40b57c916..7064eec4f9e99 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/kernels/abs_grad_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" @@ -22,6 +23,70 @@ namespace phi { #if defined(__NVCC__) + +template +struct AbsGradCUDAFunctor { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + + HOSTDEVICE inline T operator()(const T x, const T dout) const { + T output; + if (x == T(0)) { + output = T(0); + } else { + output = T(dout) * (x / T(std::abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + + HOSTDEVICE inline phi::dtype::bfloat16 operator()( + const phi::dtype::bfloat16 x, const phi::dtype::bfloat16 dout) const { + phi::dtype::bfloat16 output; + if (x == phi::dtype::bfloat16(0)) { + output = static_cast(0); + } else { + output = (dout) * (x / abs(x)); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const float dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const double dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + template void AbsGradKernelImpl(const GPUContext& dev_ctx, const DenseTensor& x, @@ -30,9 +95,10 @@ void AbsGradKernelImpl(const GPUContext& dev_ctx, std::vector ins = {&x, &dout}; std::vector outs = {dx}; dev_ctx.Alloc(dx); - phi::funcs::AbsGradCUDAFunctor abs_grad_cuda_functor; + AbsGradCUDAFunctor abs_grad_cuda_functor; phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, abs_grad_cuda_functor); } + template void AbsGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 4411fdc3d1006..913777c2515f4 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -3699,6 +3699,7 @@ def test_check_grad(self): create_test_act_bf16_class(TestRelu) +create_test_act_bf16_class(TestAbs) if __name__ == "__main__": unittest.main() From 1066094a035ec7d21522a90d22c93f72b9e8e4ba Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 23 Nov 2022 15:39:00 +0800 Subject: [PATCH 169/210] Make bfloat16 implicitly convert to float/double (#48238) * make bfloat16 implicit convert to float/double * fix bfloat16_test ut compile --- 
paddle/fluid/platform/bfloat16_test.cu | 2 +- paddle/phi/backends/gpu/cuda/cuda_device_function.h | 8 +++----- paddle/phi/common/bfloat16.h | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index 1e1919bfca059..cec83cbd11fe9 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -39,7 +39,7 @@ TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) { TEST(bfloat16, assignment_operator_on_gpu) { // Assignment operator bfloat16 v_assign; - v_assign = nv_bfloat16(bfloat16(1.0f)); + v_assign = bfloat16(1.0f).to_nv_bfloat16(); EXPECT_EQ(v_assign.x, 0x3f80); v_assign = 0.33333; EXPECT_EQ(v_assign.x, 0x3eab); diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h index 10aee53c45cf9..4ff2e528a91cd 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -67,10 +67,8 @@ template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { #if defined(PADDLE_CUDA_BF16) - return phi::dtype::bfloat16(__shfl_down_sync(mask, - static_cast(val), - static_cast(delta), - width)); + return phi::dtype::bfloat16(__shfl_down_sync( + mask, val.to_nv_bfloat16(), static_cast(delta), width)); #else PADDLE_ENFORCE( false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); @@ -114,7 +112,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( unsigned mask, phi::dtype::bfloat16 val, int width) { #if defined(PADDLE_CUDA_BF16) return phi::dtype::bfloat16( - __shfl_xor_sync(mask, static_cast(val), width)); + __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); #else PADDLE_ENFORCE( false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 6a11f0c0714c2..37e4b55fbbc07 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -145,7 +145,7 @@ struct PADDLE_ALIGN(2) bfloat16 { } // Conversion opertors - HOSTDEVICE inline explicit operator float() const { + HOSTDEVICE inline operator float() const { #ifdef PADDLE_WITH_HIP uint32_t res = 0; // We should be using memcpy in order to respect the strict aliasing rule @@ -168,7 +168,7 @@ struct PADDLE_ALIGN(2) bfloat16 { } #ifdef PADDLE_CUDA_BF16 - HOSTDEVICE inline explicit operator __nv_bfloat16() const { + HOSTDEVICE inline __nv_bfloat16 to_nv_bfloat16() const { return *reinterpret_cast(&x); } #endif @@ -207,7 +207,7 @@ struct PADDLE_ALIGN(2) bfloat16 { return static_cast(static_cast(*this)); } - HOSTDEVICE inline explicit operator double() const { + HOSTDEVICE inline operator double() const { return static_cast(static_cast(*this)); } }; From a914d68eb70c2fbb0cf5e044c8d110dd78a43e49 Mon Sep 17 00:00:00 2001 From: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Date: Wed, 23 Nov 2022 15:40:30 +0800 Subject: [PATCH 170/210] =?UTF-8?q?=E3=80=90fluid=20clean=E3=80=91=20move?= =?UTF-8?q?=20out=20LayerList,=20ParameterList,=20Sequential=20from=20flui?= =?UTF-8?q?d=20(#48197)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fluid clean move out LayerList, ParameterList, Sequential from fluid * copy Sequential from fluid to paddle.nn * replace fluid LayerList,ParamterList, Sequential in unit test file * add omit unit test file use 
paddle.nn.Sequential * remove numpy and use paddle.uniform instead --- .../slim/tests/imperative_test_utils.py | 2 +- .../slim/tests/test_imperative_out_scale.py | 2 +- .../contrib/slim/tests/test_imperative_qat.py | 2 +- .../slim/tests/test_imperative_qat_lsq.py | 2 +- python/paddle/fluid/dygraph/container.py | 242 -------------- .../fleet/hybrid_parallel_pp_embedding.py | 2 +- .../unittests/hybrid_parallel_pp_layer.py | 2 +- .../test_imperative_container_layerlist.py | 11 +- ...test_imperative_container_parameterlist.py | 12 +- .../test_imperative_named_members.py | 2 +- ...perative_star_gan_with_gradient_penalty.py | 8 +- python/paddle/nn/__init__.py | 6 +- python/paddle/nn/layer/container.py | 309 ++++++++++++++++++ python/paddle/nn/layer/rnn.py | 3 +- python/paddle/nn/layer/transformer.py | 3 +- 15 files changed, 329 insertions(+), 279 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py index 91c2dacfdd91b..1a5f52b0406d8 100644 --- a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py +++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py @@ -17,7 +17,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D from paddle.nn import BatchNorm1D diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 3dd871a64e2d5..02b19947ec9f6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -27,7 +27,7 @@ from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.framework import IrGraph, _test_eager_guard from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index e037cea8f558c..aff07fb397cb9 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -25,7 +25,7 @@ from paddle.fluid import core from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential from paddle.nn import Linear, Conv2D, Softmax, Conv2DTranspose from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py index bee201ee790f7..fbf1f4e7f5180 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_lsq.py @@ -29,7 +29,7 @@ MomentumOptimizer, ) from 
paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D from paddle.fluid.log_helper import get_logger diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index 254e13555b230..af0f043495cff 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -12,15 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from collections import OrderedDict -from ..framework import Parameter from .layers import Layer -from .base import param_guard __all__ = [ 'Sequential', - 'ParameterList', - 'LayerList', ] @@ -97,240 +92,3 @@ def forward(self, input): for layer in self._sub_layers.values(): input = layer(input) return input - - -class ParameterList(Layer): - """ParameterList Container. - - This container acts like a Python list, but parameters it contains will be properly added. - - Parameters: - parameters (iterable, optional): Iterable Parameters to be added - - Examples: - .. code-block:: python - - import paddle - import numpy as np - - class MyLayer(paddle.nn.Layer): - def __init__(self, num_stacked_param): - super().__init__() - # create ParameterList with iterable Parameters - self.params = paddle.nn.ParameterList( - [paddle.create_parameter( - shape=[2, 2], dtype='float32')] * num_stacked_param) - - def forward(self, x): - for i, p in enumerate(self.params): - tmp = self._helper.create_variable_for_type_inference('float32') - self._helper.append_op( - type="mul", - inputs={"X": x, - "Y": p}, - outputs={"Out": tmp}, - attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) - x = tmp - return x - - data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32') - x = paddle.to_tensor(data_np) - num_stacked_param = 4 - model = MyLayer(num_stacked_param) - print(len(model.params)) # 4 - res = model(x) - print(res.shape) # [5, 2] - - replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32') - model.params[num_stacked_param - 1] = replaced_param # replace last param - res = model(x) - print(res.shape) # [5, 3] - model.params.append(paddle.create_parameter(shape=[3, 4], dtype='float32')) # append param - print(len(model.params)) # 5 - res = model(x) - print(res.shape) # [5, 4] - """ - - def __init__(self, parameters=None): - super().__init__() - if parameters is not None: - for idx, param in enumerate(parameters): - assert isinstance(param, Parameter) - self.add_parameter(str(idx), param) - - def __getitem__(self, idx): - with param_guard(self._parameters): - return self._parameters[str(idx)] - - def __setitem__(self, idx, param): - assert isinstance(param, Parameter) - setattr(self, str(idx), param) - - def __len__(self): - return len(self._parameters) - - def __iter__(self): - with param_guard(self._parameters): - return iter(self._parameters.values()) - - def append(self, parameter): - """Appends a given parameter at the end of the list. - - Parameters: - parameter (Parameter): parameter to append - """ - idx = len(self._parameters) - self.add_parameter(str(idx), parameter) - return self - - -class LayerList(Layer): - """ - LayerList holds sublayers, and sublayers it contains are properly registered. - Holded sublayers can be indexed like a regular python list. 
- - Parameters: - sublayers (iterable of Layer, optional): sublayers to hold - - Examples: - .. code-block:: python - - import paddle - import numpy as np - - class MyLayer(paddle.nn.Layer): - def __init__(self): - super().__init__() - self.linears = paddle.nn.LayerList( - [paddle.nn.Linear(10, 10) for i in range(10)]) - - def forward(self, x): - # LayerList can act as an iterable, or be indexed using ints - for i, l in enumerate(self.linears): - x = self.linears[i // 2](x) + l(x) - return x - """ - - def __init__(self, sublayers=None): - super().__init__() - if sublayers is not None: - for idx, layer in enumerate(sublayers): - self.add_sublayer(str(idx), layer) - - def _get_abs_idx(self, idx): - if isinstance(idx, int): - if not (-len(self) <= idx < len(self)): - raise IndexError( - 'index {} is out of range, should be an integer in range [{}, {})'.format( - idx, -len(self), len(self) - ) - ) - if idx < 0: - idx += len(self) - return idx - - def __getitem__(self, idx): - if isinstance(idx, slice): - return self.__class__(list(self._sub_layers.values())[idx]) - else: - idx = self._get_abs_idx(idx) - return self._sub_layers[str(idx)] - - def __setitem__(self, idx, sublayer): - idx = self._get_abs_idx(idx) - return setattr(self, str(idx), sublayer) - - def __delitem__(self, idx): - if isinstance(idx, slice): - for k in range(len(self._sub_layers))[idx]: - delattr(self, str(k)) - else: - idx = self._get_abs_idx(idx) - delattr(self, str(idx)) - str_indices = [str(i) for i in range(len(self._sub_layers))] - self._sub_layers = OrderedDict( - list(zip(str_indices, self._sub_layers.values())) - ) - - def __len__(self): - return len(self._sub_layers) - - def __iter__(self): - return iter(self._sub_layers.values()) - - def append(self, sublayer): - """ - Appends a sublayer to the end of the list. - - Parameters: - sublayer (Layer): sublayer to append - - Examples: - .. code-block:: python - - import paddle - - linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) - another = paddle.nn.Linear(10, 10) - linears.append(another) - print(len(linears)) # 11 - """ - self.add_sublayer(str(len(self)), sublayer) - return self - - def insert(self, index, sublayer): - """ - Insert a sublayer before a given index in the list. - - Parameters: - index (int): index to insert. - sublayer (Layer): sublayer to insert - - Examples: - .. code-block:: python - - import paddle - - linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) - another = paddle.nn.Linear(10, 10) - linears.insert(3, another) - print(linears[3] is another) # True - another = paddle.nn.Linear(10, 10) - linears.insert(-1, another) - print(linears[-2] is another) # True - """ - assert isinstance(index, int) and -len(self._sub_layers) <= index < len( - self._sub_layers - ), "index should be an integer in range [{}, {})".format( - -len(self), len(self) - ) - - index = self._get_abs_idx(index) - for i in range(len(self._sub_layers), index, -1): - self._sub_layers[str(i)] = self._sub_layers[str(i - 1)] - self._sub_layers[str(index)] = sublayer - - def extend(self, sublayers): - """ - Appends sublayers to the end of the list. - - Parameters: - sublayers (iterable of Layer): iterable of sublayers to append - - Examples: - .. 
code-block:: python - - import paddle - - linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) - another_list = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(5)]) - linears.extend(another_list) - print(len(linears)) # 15 - print(another_list[0] is linears[10]) # True - """ - offset = len(self) - for i, sublayer in enumerate(sublayers): - idx = str(offset + i) - self.add_sublayer(idx, sublayer) - return self diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py index 33833454c0152..99bb63cd96970 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_embedding.py @@ -19,7 +19,7 @@ import paddle import paddle.distributed as dist import paddle.distributed.fleet as fleet -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential from paddle.distributed.fleet.meta_parallel import PipelineLayer from paddle.fluid.dygraph.layers import Layer import paddle.nn as nn diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py index d073d7267a251..b9d8db5aa878a 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py @@ -15,7 +15,7 @@ import unittest import numpy as np from paddle.distributed import fleet -from paddle.fluid.dygraph.container import Sequential +from paddle.nn import Sequential import paddle.nn as nn from paddle.fluid.dygraph.layers import Layer from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py index f0ac7ee14c917..db86295a5aa91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py @@ -31,11 +31,6 @@ def forward(self, x): class TestImperativeContainer(unittest.TestCase): - def fluid_dygraph_list(self): - return fluid.dygraph.LayerList( - [fluid.dygraph.Linear(2**i, 2 ** (i + 1)) for i in range(6)] - ) - def paddle_imperative_list(self): return paddle.nn.LayerList( [fluid.dygraph.Linear(2**i, 2 ** (i + 1)) for i in range(6)] @@ -45,11 +40,7 @@ def layer_list(self, use_fluid_api): data_np = np.random.uniform(-1, 1, [5, 1]).astype('float32') with fluid.dygraph.guard(): x = fluid.dygraph.to_variable(data_np) - layerlist = ( - self.fluid_dygraph_list() - if use_fluid_api - else self.paddle_imperative_list() - ) + layerlist = self.paddle_imperative_list() size = len(layerlist) model = MyLayer(layerlist) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py index f1cd45904de28..a87392741223b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_parameterlist.py @@ -24,17 +24,7 @@ class MyLayer(fluid.Layer): def __init__(self, num_stacked_param, use_fluid_api): super().__init__() # create ParameterList with iterable Parameters - self.params = ( - 
self.fluid_dygraph_ParameterList(num_stacked_param) - if use_fluid_api - else self.paddle_imperative_ParameterList(num_stacked_param) - ) - - def fluid_dygraph_ParameterList(self, num_stacked_param): - return fluid.dygraph.ParameterList( - [fluid.layers.create_parameter(shape=[2, 2], dtype='float32')] - * num_stacked_param - ) + self.params = self.paddle_imperative_ParameterList(num_stacked_param) def paddle_imperative_ParameterList(self, num_stacked_param): return paddle.nn.ParameterList( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py index c5e39d330d190..044661940cb11 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py @@ -37,7 +37,7 @@ def func_test_named_sublayers(self): fc1 = fluid.Linear(10, 3) fc2 = fluid.Linear(3, 10, bias_attr=False) custom = MyLayer(3, 10) - model = fluid.dygraph.Sequential(fc1, fc2, custom) + model = paddle.nn.Sequential(fc1, fc2, custom) named_sublayers = model.named_sublayers() list_named_sublayers = list(named_sublayers) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 4a99e0fb63b18..a75bc4b8a8e58 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -268,7 +268,7 @@ def __init__(self, cfg, num_channels=3): cur_channels *= 2 sub_layers.append(sub_layer) - self._conv0 = fluid.dygraph.Sequential(*sub_layers) + self._conv0 = paddle.nn.Sequential(*sub_layers) repeat_num = cfg.g_repeat_num sub_layers = [] @@ -278,7 +278,7 @@ def __init__(self, cfg, num_channels=3): ) sub_layers.append(res_block) - self._res_block = fluid.dygraph.Sequential(*sub_layers) + self._res_block = paddle.nn.Sequential(*sub_layers) cur_channels = cfg.g_base_dims * 4 sub_layers = [] @@ -296,7 +296,7 @@ def __init__(self, cfg, num_channels=3): cur_channels = cfg.g_base_dims * rate sub_layers.append(deconv) - self._deconv = fluid.dygraph.Sequential(*sub_layers) + self._deconv = paddle.nn.Sequential(*sub_layers) self._conv1 = Conv2DLayer( num_channels=cur_channels, @@ -353,7 +353,7 @@ def __init__(self, cfg, num_channels=3): cur_dim *= 2 sub_layers.append(sub_layer) - self._conv0 = fluid.dygraph.Sequential(*sub_layers) + self._conv0 = paddle.nn.Sequential(*sub_layers) kernel_size = int(cfg.image_size / np.power(2, repeat_num)) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 8e02cdffd5e05..a4aaa18ea5552 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -15,9 +15,9 @@ # TODO: import all neural network related api under this directory, # including layers, linear, conv, rnn etc. 
from ..fluid.dygraph.layers import Layer # noqa: F401 -from ..fluid.dygraph.container import LayerList # noqa: F401 -from ..fluid.dygraph.container import ParameterList # noqa: F401 -from ..fluid.dygraph.container import Sequential # noqa: F401 +from .layer.container import LayerList # noqa: F401 +from .layer.container import ParameterList # noqa: F401 +from .layer.container import Sequential # noqa: F401 from .clip import ClipGradByGlobalNorm # noqa: F401 from .clip import ClipGradByNorm # noqa: F401 diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py index d70d8fec2caa8..0a8e5ec009d02 100644 --- a/python/paddle/nn/layer/container.py +++ b/python/paddle/nn/layer/container.py @@ -15,6 +15,8 @@ from collections import OrderedDict from .. import Layer from collections.abc import Iterable, Mapping +from ...fluid.framework import Parameter +from ...fluid.dygraph.base import param_guard __all__ = [] @@ -295,3 +297,310 @@ def update(self, sublayers): + ", which must be 2." ) self.add_sublayer(kv[0], kv[1]) + + +class Sequential(Layer): + """Sequential container. + Sub layers will be added to this container in the order of argument in the constructor. + The argument passed to the constructor can be iterable Layers or iterable name Layer pairs. + + Parameters: + layers(Layer|list|tuple): Layer or list/tuple of iterable name Layer pair. + + Examples: + .. code-block:: python + + import paddle + + data = paddle.uniform(shape=[30, 10], dtype='float32') + # create Sequential with iterable Layers + model1 = paddle.nn.Sequential( + paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2) + ) + model1[0] # access the first layer + res1 = model1(data) # sequential execution + + # create Sequential with name Layer pairs + model2 = paddle.nn.Sequential( + ('l1', paddle.nn.Linear(10, 2)), + ('l2', paddle.nn.Linear(2, 3)) + ) + model2['l1'] # access l1 layer + model2.add_sublayer('l3', paddle.nn.Linear(3, 3)) # add sublayer + res2 = model2(data) # sequential execution + + """ + + def __init__(self, *layers): + super().__init__() + if len(layers) > 0 and isinstance(layers[0], (list, tuple)): + for name, layer in layers: + self.add_sublayer(name, layer) + else: + for idx, layer in enumerate(layers): + self.add_sublayer(str(idx), layer) + + def __getitem__(self, name): + if isinstance(name, slice): + return self.__class__(*(list(self._sub_layers.values())[name])) + elif isinstance(name, str): + return self._sub_layers[name] + else: + if name >= len(self._sub_layers): + raise IndexError('index {} is out of range'.format(name)) + elif name < 0 and name >= -len(self._sub_layers): + name += len(self._sub_layers) + elif name < -len(self._sub_layers): + raise IndexError('index {} is out of range'.format(name)) + return list(self._sub_layers.values())[name] + + def __setitem__(self, name, layer): + assert isinstance(layer, Layer) + setattr(self, str(name), layer) + + def __delitem__(self, name): + name = str(name) + assert name in self._sub_layers + del self._sub_layers[name] + + def __len__(self): + return len(self._sub_layers) + + def forward(self, input): + for layer in self._sub_layers.values(): + input = layer(input) + return input + + +class ParameterList(Layer): + """ParameterList Container. + + This container acts like a Python list, but parameters it contains will be properly added. + + Parameters: + parameters (iterable, optional): Iterable Parameters to be added + + Examples: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self, num_stacked_param): + super().__init__() + # create ParameterList with iterable Parameters + self.params = paddle.nn.ParameterList( + [paddle.create_parameter( + shape=[2, 2], dtype='float32')] * num_stacked_param) + + def forward(self, x): + for i, p in enumerate(self.params): + tmp = self._helper.create_variable_for_type_inference('float32') + self._helper.append_op( + type="mul", + inputs={"X": x, + "Y": p}, + outputs={"Out": tmp}, + attrs={"x_num_col_dims": 1, + "y_num_col_dims": 1}) + x = tmp + return x + + x = paddle.uniform(shape=[5, 2], dtype='float32') + num_stacked_param = 4 + model = MyLayer(num_stacked_param) + print(len(model.params)) # 4 + res = model(x) + print(res.shape) # [5, 2] + + replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32') + model.params[num_stacked_param - 1] = replaced_param # replace last param + res = model(x) + print(res.shape) # [5, 3] + model.params.append(paddle.create_parameter(shape=[3, 4], dtype='float32')) # append param + print(len(model.params)) # 5 + res = model(x) + print(res.shape) # [5, 4] + """ + + def __init__(self, parameters=None): + super().__init__() + if parameters is not None: + for idx, param in enumerate(parameters): + assert isinstance(param, Parameter) + self.add_parameter(str(idx), param) + + def __getitem__(self, idx): + with param_guard(self._parameters): + return self._parameters[str(idx)] + + def __setitem__(self, idx, param): + assert isinstance(param, Parameter) + setattr(self, str(idx), param) + + def __len__(self): + return len(self._parameters) + + def __iter__(self): + with param_guard(self._parameters): + return iter(self._parameters.values()) + + def append(self, parameter): + """Appends a given parameter at the end of the list. + + Parameters: + parameter (Parameter): parameter to append + """ + idx = len(self._parameters) + self.add_parameter(str(idx), parameter) + return self + + +class LayerList(Layer): + """ + LayerList holds sublayers, and sublayers it contains are properly registered. + Holded sublayers can be indexed like a regular python list. + + Parameters: + sublayers (iterable of Layer, optional): sublayers to hold + + Examples: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.linears = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for i in range(10)]) + + def forward(self, x): + # LayerList can act as an iterable, or be indexed using ints + for i, l in enumerate(self.linears): + x = self.linears[i // 2](x) + l(x) + return x + """ + + def __init__(self, sublayers=None): + super().__init__() + if sublayers is not None: + for idx, layer in enumerate(sublayers): + self.add_sublayer(str(idx), layer) + + def _get_abs_idx(self, idx): + if isinstance(idx, int): + if not (-len(self) <= idx < len(self)): + raise IndexError( + 'index {} is out of range, should be an integer in range [{}, {})'.format( + idx, -len(self), len(self) + ) + ) + if idx < 0: + idx += len(self) + return idx + + def __getitem__(self, idx): + if isinstance(idx, slice): + return self.__class__(list(self._sub_layers.values())[idx]) + else: + idx = self._get_abs_idx(idx) + return self._sub_layers[str(idx)] + + def __setitem__(self, idx, sublayer): + idx = self._get_abs_idx(idx) + return setattr(self, str(idx), sublayer) + + def __delitem__(self, idx): + if isinstance(idx, slice): + for k in range(len(self._sub_layers))[idx]: + delattr(self, str(k)) + else: + idx = self._get_abs_idx(idx) + delattr(self, str(idx)) + str_indices = [str(i) for i in range(len(self._sub_layers))] + self._sub_layers = OrderedDict( + list(zip(str_indices, self._sub_layers.values())) + ) + + def __len__(self): + return len(self._sub_layers) + + def __iter__(self): + return iter(self._sub_layers.values()) + + def append(self, sublayer): + """ + Appends a sublayer to the end of the list. + + Parameters: + sublayer (Layer): sublayer to append + + Examples: + .. code-block:: python + + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.append(another) + print(len(linears)) # 11 + """ + self.add_sublayer(str(len(self)), sublayer) + return self + + def insert(self, index, sublayer): + """ + Insert a sublayer before a given index in the list. + + Parameters: + index (int): index to insert. + sublayer (Layer): sublayer to insert + + Examples: + .. code-block:: python + + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.insert(3, another) + print(linears[3] is another) # True + another = paddle.nn.Linear(10, 10) + linears.insert(-1, another) + print(linears[-2] is another) # True + """ + assert isinstance(index, int) and -len(self._sub_layers) <= index < len( + self._sub_layers + ), "index should be an integer in range [{}, {})".format( + -len(self), len(self) + ) + + index = self._get_abs_idx(index) + for i in range(len(self._sub_layers), index, -1): + self._sub_layers[str(i)] = self._sub_layers[str(i - 1)] + self._sub_layers[str(index)] = sublayer + + def extend(self, sublayers): + """ + Appends sublayers to the end of the list. + + Parameters: + sublayers (iterable of Layer): iterable of sublayers to append + + Examples: + .. 
code-block:: python + + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another_list = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(5)]) + linears.extend(another_list) + print(len(linears)) # 15 + print(another_list[0] is linears[10]) # True + """ + offset = len(self) + for i, sublayer in enumerate(sublayers): + idx = str(offset + i) + self.add_sublayer(idx, sublayer) + return self diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index aeac50d0680e1..c93dcae4ac051 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -20,7 +20,8 @@ from paddle import framework from paddle.nn import functional as F from paddle.nn import initializer as I -from paddle.nn import Layer, LayerList +from paddle.nn import Layer +from .container import LayerList from paddle.fluid.layers import utils from paddle.fluid.layers.utils import flatten, map_structure from paddle import _C_ops, _legacy_C_ops diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index af7fc19794a35..52cffc4998a1d 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -24,7 +24,8 @@ from .. import functional as F from ... import tensor from ...fluid import layers -from .. import Layer, LayerList +from .. import Layer +from .container import LayerList from ...framework import ParamAttr from paddle.fluid.data_feeder import convert_dtype From 67204c18d0d671887f54128e4348c6b07a3f3fb6 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Wed, 23 Nov 2022 15:57:14 +0800 Subject: [PATCH 171/210] [Paddle Inference] add Conv2d fusion layout transfer pass (#48128) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/conv2d_fusion_layout_transfer_pass.cc | 278 ++++++++++++++++++ .../ir/conv2d_fusion_layout_transfer_pass.h | 34 +++ 3 files changed, 313 insertions(+) create mode 100644 paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc create mode 100644 paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9ea065b567f0b..96bb7c53271a9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -101,6 +101,7 @@ pass_library(delete_c_identity_op_pass inference) pass_library(preln_residual_bias_fuse_pass inference) pass_library(delete_fill_constant_op_pass inference) pass_library(constant_folding_pass inference) +pass_library(conv2d_fusion_layout_transfer_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc new file mode 100644 index 0000000000000..7ac8096bb9177 --- /dev/null +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc @@ -0,0 +1,278 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace { + +void TransDataLayout(DataLayout from_layout, + DataLayout to_layout, + const phi::DenseTensor &in, + phi::DenseTensor *out) { + PADDLE_ENFORCE_EQ( + arity(in.dims()), + 4, + platform::errors::InvalidArgument( + "Input dimension arity only can be 4, the input dimension is %s.", + in.dims())); + + auto &pool = platform::DeviceContextPool::Instance(); + + auto src_dim = in.dims(); + std::vector dst_dim; + + auto axis = GetAxis(from_layout, to_layout); + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + out->Resize(phi::make_ddim(dst_dim)); + out->mutable_data(phi::CPUPlace(), in.dtype()); + + framework::VisitDataType( + framework::TransToProtoVarType(in.dtype()), + CastDataLayout(pool.Get(phi::CPUPlace()), axis, in, out)); + + out->set_layout(to_layout); +} + +void InsertLayoutTransOp(ir::Graph *graph, + ir::Node *prev_node, + ir::Node *next_node, + DataLayout from_layout, + DataLayout to_layout, + framework::BlockDesc *block_desc, + std::unordered_map *cache) { + auto do_insert = [&](const std::string &in_var_name, + const std::string &out_var_name) { + auto update_op_desc = [&](framework::OpDesc &desc, + const std::string &x_name, + const std::string &out_name) { + desc.SetType("transfer_layout"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("src_layout", static_cast(from_layout)); + desc.SetAttr("dst_layout", static_cast(to_layout)); + desc.Flush(); + }; + CHECK_NOTNULL(block_desc); + if (cache->count(prev_node) == 0) { + framework::OpDesc op_desc(block_desc); + update_op_desc(op_desc, in_var_name, out_var_name); + auto *op_node = graph->CreateOpNode(&op_desc); + auto *op_out_var_desc = block_desc->Var(out_var_name); + + op_out_var_desc->SetPersistable(false); + op_out_var_desc->SetDataType(prev_node->Var()->GetDataType()); + auto to_shape = prev_node->Var()->GetShape(); + if (from_layout == DataLayout::kNCHW) { + auto n = to_shape[0]; + auto c = to_shape[1]; + auto h = to_shape[2]; + auto w = to_shape[3]; + op_out_var_desc->SetShape({n, h, w, c}); + } else { + auto n = to_shape[0]; + auto h = to_shape[1]; + auto w = to_shape[2]; + auto c = to_shape[3]; + op_out_var_desc->SetShape({n, c, h, w}); + } + + auto *op_out_var_node = graph->CreateVarNode(op_out_var_desc); + IR_NODE_LINK_TO(op_node, op_out_var_node); + cache->insert(std::make_pair(prev_node, op_out_var_node)); + } + next_node->Op()->RenameInput(prev_node->Name(), + cache->at(prev_node)->Name()); + IR_NODE_LINK_TO(prev_node, cache->at(prev_node)->inputs.front()); + IR_NODE_LINK_TO(cache->at(prev_node), next_node); + + IR_NODE_UNLINK(prev_node, 
next_node); + }; + + if (from_layout == DataLayout::kNCHW && to_layout == DataLayout::kNHWC) { + auto in_var_name = prev_node->Var()->Name(); + auto out_var_name = in_var_name + "_nchw_to_nhwc"; + do_insert(in_var_name, out_var_name); + } else if (from_layout == DataLayout::kNHWC && + to_layout == DataLayout::kNCHW) { + auto in_var_name = prev_node->Var()->Name(); + auto out_var_name = in_var_name + "_nhwc_to_nchw"; + do_insert(in_var_name, out_var_name); + } +} + +} // namespace + +void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, + platform::errors::PreconditionNotMet("graph should not be nullptr.")); + FusePassBase::Init("data_layout_transfer", graph); + auto *scope = param_scope(); + + PADDLE_ENFORCE_EQ(graph->IsMainGraph(), + true, + platform::errors::InvalidArgument( + "the graph should be main graph when applying " + "conv2d_fusion_layout_transfer_pass")); + + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal("scope must not be nullptr when applying " + "conv2d_fusion_layout_transfer_pass")); + + // Not support multiple block now. + std::unordered_map cache; + auto op_nodes = ir::TopologySortOperations(*graph); + auto iter = op_nodes.cbegin(); + auto *block_desc = (*iter)->Op()->Block(); + + std::unordered_set vars_shape_nhwc; + + // Only support conv2d_fusion now. + std::string target_op_type = "conv2d_fusion"; + std::unordered_set valid_ops; + + auto OpIsValid = [&](ir::Node *op_node) -> bool { + if (op_node->Op()->Type() != target_op_type) return false; + + auto data_format = + op_node->Op()->GetAttrIfExists("data_format"); + if (data_format != "NCHW") return false; + + auto filter_names = op_node->Op()->Input("Filter"); + + // If filter's channel is not multiple of 8, conv2d_fusion not run at nhwc. 
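    // Likely rationale (an assumption, not stated in the code): the NHWC
    // conv2d_fusion kernels vectorize channels in groups of 8 (e.g. fp16
    // tensor-core tiles), so a filter whose output or input channel count is
    // not a multiple of 8 is left on the NCHW path.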
+ for (const auto &filter_name : filter_names) { + auto *filter_var = scope->FindLocalVar(filter_name); + const auto &filter_tensor = filter_var->Get(); + if (filter_tensor.dims().size() == 4 && + (filter_tensor.dims()[0] % 8 != 0 || + filter_tensor.dims()[1] % 8 != 0)) { + return false; + } + } + return true; + }; + + for (auto *op_node : op_nodes) { + CHECK_EQ(op_node->IsOp(), true); + if (OpIsValid(op_node)) { + valid_ops.insert(op_node); + auto *op_desc = op_node->Op(); + auto nhwc_attr = framework::Attribute(std::string("NHWC")); + op_desc->SetAttr("data_format", nhwc_attr); + op_desc->Flush(); + + // transfer weights + auto filter_names = op_desc->Input("Filter"); + for (const auto &filter_name : filter_names) { + auto *filter_var = scope->FindLocalVar(filter_name); + auto *filter_tensor = filter_var->GetMutable(); + phi::DenseTensor temp_tensor = *filter_tensor; + filter_tensor->clear(); + + TransDataLayout( + DataLayout::kNCHW, DataLayout::kNHWC, temp_tensor, filter_tensor); + } + auto op_inputs = op_node->inputs; + for (auto *in_var_node : op_inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + if (in_var_node->Var()->Persistable()) { + if (std::find(filter_names.cbegin(), + filter_names.cend(), + in_var_node->Var()->Name()) != filter_names.cend()) { + auto from_shape = in_var_node->Var()->GetShape(); + in_var_node->Var()->SetShape( + {from_shape[0], from_shape[2], from_shape[3], from_shape[1]}); + } + } + } + + // transfer outputs + auto op_outputs = op_node->outputs; + for (auto *out_var_node : op_outputs) { + CHECK_EQ(out_var_node->IsVar(), true); + if (out_var_node->Var()->Persistable()) continue; + + auto from_shape = out_var_node->Var()->GetShape(); + out_var_node->Var()->SetShape( + {from_shape[0], from_shape[2], from_shape[3], from_shape[1]}); + vars_shape_nhwc.insert(out_var_node); + } + } + } + + // Insert transfer_layout op + for (auto *op_node : op_nodes) { + CHECK_EQ(op_node->IsOp(), true); + + if (valid_ops.count(op_node)) { + auto op_inputs = op_node->inputs; + for (auto *in_var_node : op_inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + + if (in_var_node->Var()->Persistable()) continue; + if (vars_shape_nhwc.count(in_var_node)) continue; + + InsertLayoutTransOp(graph, + in_var_node, + op_node, + DataLayout::kNCHW, + DataLayout::kNHWC, + block_desc, + &cache); + } + } else { + auto op_inputs = op_node->inputs; + for (auto *in_var_node : op_inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + + if (vars_shape_nhwc.count(in_var_node)) { + InsertLayoutTransOp(graph, + in_var_node, + op_node, + DataLayout::kNHWC, + DataLayout::kNCHW, + block_desc, + &cache); + } + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv2d_fusion_layout_transfer_pass, + paddle::framework::ir::Conv2dFusionLayoutTransferPass); diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.h b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.h new file mode 100644 index 0000000000000..7a67994e95259 --- /dev/null +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Conv2dFusionLayoutTransferPass : public FusePassBase { + public: + Conv2dFusionLayoutTransferPass() = default; + virtual ~Conv2dFusionLayoutTransferPass() = default; + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle From 3daf5185efa5ce7092f15f58c55820da882d5afb Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 23 Nov 2022 16:03:30 +0800 Subject: [PATCH 172/210] add map_depthwise_conv_to_conv pass (#47955) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/map_depthwise_conv_to_conv_pass.cc | 61 +++++++++++++++++++ .../ir/map_depthwise_conv_to_conv_pass.h | 36 +++++++++++ .../inference/api/paddle_pass_builder.cc | 6 +- 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.cc create mode 100644 paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.h mode change 100755 => 100644 paddle/fluid/inference/api/paddle_pass_builder.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 96bb7c53271a9..06ea7acb3315e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -76,6 +76,7 @@ pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) pass_library(multi_batch_merge_pass base) +pass_library(map_depthwise_conv_to_conv_pass inference) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqpool_concat_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.cc b/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.cc new file mode 100644 index 0000000000000..341fedcd4bacd --- /dev/null +++ b/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
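// Purpose of this pass: rewrite the op type of every depthwise_conv2d node to
// plain conv2d (see replaced_map in ApplyImpl below). The likely intent is to
// let those ops reuse the ordinary conv2d kernels and the conv-oriented fuse
// passes that run afterwards; treat that intent as an inference from the GPU
// pass ordering rather than documented behavior.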
+ +#include "paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MapDepthwiseConv2ConvPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("map_depthwise_conv_to_conv_pass", graph); + + int found_count = 0; + std::unordered_map replaced_map{ + {"depthwise_conv2d", "conv2d"}, + }; + + auto nodes = graph->Nodes(); + + for (auto& node : nodes) { + if (!node->IsOp()) continue; + auto* op_desc = node->Op(); + std::string op_type = op_desc->Type(); + if (!replaced_map.count(op_type)) continue; + op_desc->SetType(replaced_map[op_type]); + op_desc->Flush(); + ++found_count; + } + + AddStatis(found_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(map_depthwise_conv_to_conv_pass, + paddle::framework::ir::MapDepthwiseConv2ConvPass); +REGISTER_PASS_CAPABILITY(map_depthwise_conv_to_conv_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("depthwise_conv2d", 1) + .LE("conv2d", 1)); diff --git a/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.h b/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.h new file mode 100644 index 0000000000000..1691ab6973465 --- /dev/null +++ b/paddle/fluid/framework/ir/map_depthwise_conv_to_conv_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MapDepthwiseConv2ConvPass : public FusePassBase { + public: + MapDepthwiseConv2ConvPass() = default; + virtual ~MapDepthwiseConv2ConvPass() = default; + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc old mode 100755 new mode 100644 index 062264222b255..c964ce7e4d0d2 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -165,6 +165,7 @@ const std::vector kLiteSubgraphPasses({ // running errors. After fusion operator supports low precision, delete this. 
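// Note on ordering: map_depthwise_conv_to_conv_pass is placed ahead of the
// conv_bn and conv_elementwise fusions in the lists below, presumably so that
// depthwise convs are already rewritten to conv2d before those fusions search
// for conv2d nodes. Only the ordering itself is given by the lists; the
// "presumably" is an inference.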
const std::vector kGpuLowerPrecisionPasses{ "simplify_with_basic_ops_pass", + "map_depthwise_conv_to_conv_pass", "conv_bn_fuse_pass", "conv_eltwiseadd_bn_fuse_pass", "conv_elementwise_add_act_fuse_pass", @@ -202,8 +203,9 @@ const std::vector kTrtLowerPrecisionPasses{ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "map_depthwise_conv_to_conv_pass", "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "embedding_eltwise_layernorm_fuse_pass", // From a0f473504bdddb414497861335c83b7a10917c68 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Wed, 23 Nov 2022 17:46:06 +0800 Subject: [PATCH 173/210] [Clean Fluid API]Remove API: label_smooth (#47952) * Remove API: label_smooth (replace with paddle.nn.functional.label_smooth) Replace the paddle.fluid.layers.label_smooth with the paddle.nn.functional.label_smooth * modify the call of label_smooth from old style to new style --- python/paddle/fluid/layers/nn.py | 90 ------------------- .../fleet/parallel_dygraph_transformer.py | 3 +- .../fluid/tests/unittests/dist_transformer.py | 3 +- .../transformer_dygraph_model.py | 3 +- ..._imperative_transformer_sorted_gradient.py | 3 +- .../unittests/test_label_smooth_functional.py | 3 +- .../fluid/tests/unittests/test_layers.py | 5 +- 7 files changed, 11 insertions(+), 99 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d782bf973a5f1..3d26fc1260381 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -103,7 +103,6 @@ 'lod_reset', 'lod_append', 'pad', - 'label_smooth', 'roi_pool', 'roi_align', 'image_resize', @@ -5843,95 +5842,6 @@ def pad(x, paddings, pad_value=0.0, name=None): return out -def label_smooth( - label, prior_dist=None, epsilon=0.1, dtype="float32", name=None -): - r""" - :alias_main: paddle.nn.functional.label_smooth - :alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth - :old_api: paddle.fluid.layers.label_smooth - - Label smoothing is a mechanism to regularize the classifier layer and is called - label-smoothing regularization (LSR). - - Label smoothing is proposed to encourage the model to be less confident, - since optimizing the log-likelihood of the correct label directly may - cause overfitting and reduce the ability of the model to adapt. Label - smoothing replaces the ground-truth label :math:`y` with the weighted sum - of itself and some fixed distribution :math:`\mu`. For class :math:`k`, - i.e. - - .. math:: - - \\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k, - - where :math:`1 - \epsilon` and :math:`\epsilon` are the weights - respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually - uniform distribution is used for :math:`\mu`. - - See more details about label smoothing in https://arxiv.org/abs/1512.00567. - - Parameters: - label(Variable): The input variable containing the label data. The - label data should use one-hot representation. It's - a multidimensional tensor with a shape of - :math:`[N_1, ..., Depth]`, where Depth is class number. The dtype can be "float32" and "float64". - prior_dist(Variable, optional): The prior distribution to be used to smooth - labels. If not provided, an uniform distribution - is used. It's a multidimensional tensor with a shape of - :math:`[1, class\_num]` . The default value is None. 
- epsilon(float, optional): The weight used to mix up the original ground-truth - distribution and the fixed distribution. The default value is - 0.1. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type can be set - as 'float32', 'float64'. The default value is 'float32'. - name(str, optional): The default value is None. Normally there is no need for user - to set this property. For more information, please refer to - :ref:`api_guide_Name`. - - Returns: - Variable: The tensor variable containing the smoothed labels. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - label = layers.data(name="label", shape=[1], dtype="int32") - one_hot_label = layers.one_hot(input=label, depth=10) - smooth_label = layers.label_smooth( - label=one_hot_label, epsilon=0.1, dtype="float32") - """ - if in_dygraph_mode(): - return _C_ops.label_smooth(label, prior_dist, float(epsilon)) - - if epsilon > 1.0 or epsilon < 0.0: - raise ValueError("The value of epsilon must be between 0 and 1.") - - if _non_static_mode(): - return _legacy_C_ops.label_smooth( - label, prior_dist, 'epsilon', float(epsilon) - ) - - check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'label_smooth' - ) - - helper = LayerHelper("label_smooth", **locals()) - label.stop_gradient = True - smooth_label = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="label_smooth", - inputs={"X": label, "PriorDist": prior_dist} - if prior_dist - else {"X": label}, - outputs={"Out": smooth_label}, - attrs={"epsilon": float(epsilon)}, - ) - return smooth_label - - @templatedoc() def roi_pool( input, diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index d3fd734d6aa76..07d43ccd551f5 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -26,6 +26,7 @@ from paddle.optimizer.lr import NoamDecay from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase +import paddle.nn.functional as F """ Note(chenweihang): To compare loss of single-card and multi-card @@ -934,7 +935,7 @@ def forward(self, enc_inputs, dec_inputs, label, weights): enc_output = self._wrap_encoder_layer(enc_inputs) predict = self._wrap_decoder_layer(dec_inputs, enc_output) if self._label_smooth_eps: - label_out = fluid.layers.label_smooth( + label_out = F.label_smooth( label=fluid.layers.one_hot( input=label, depth=self._trg_vocab_size ), diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 24de04dc6fb56..8ed0eae0ff682 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -27,6 +27,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers from test_dist_base import TestDistRunnerBase, runtime_main, RUN_STEP +import paddle.nn.functional as F import paddle const_para_attr = fluid.ParamAttr(initializer=fluid.initializer.Constant(0.001)) @@ -1580,7 +1581,7 @@ def transformer( # cancel padding index in calculating the loss. 
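    # Old-style vs. new-style label_smooth call, as a minimal sketch (F is
    # paddle.nn.functional; the functional API has no dtype argument, so the
    # dtype keyword is simply dropped in the migration):
    #
    #   before: fluid.layers.label_smooth(label=one_hot, epsilon=0.1, dtype="float32")
    #   after:  F.label_smooth(one_hot, epsilon=0.1)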
label, weights = make_all_inputs(label_data_input_fields) if label_smooth_eps: - label = layers.label_smooth( + label = F.label_smooth( label=layers.one_hot(input=label, depth=trg_vocab_size), epsilon=label_smooth_eps, ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 796329ab555d6..ee11e045d9aad 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -26,6 +26,7 @@ from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.layers.utils import map_structure import paddle +import paddle.nn.functional as F def position_encoding_init(n_position, d_pos_vec): @@ -574,7 +575,7 @@ def __init__(self, label_smooth_eps): def __call__(self, predict, label, weights): if self.label_smooth_eps: - label_out = layers.label_smooth( + label_out = F.label_smooth( label=layers.one_hot(input=label, depth=predict.shape[-1]), epsilon=self.label_smooth_eps, ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 700ae9a9c878a..f8671a76e2cd6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -22,6 +22,7 @@ from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard from paddle.fluid import core import numpy as np +import paddle.nn.functional as F np.set_printoptions(suppress=True) @@ -1088,7 +1089,7 @@ def forward(self, enc_inputs, dec_inputs, label, weights): enc_output = self._wrap_encoder_layer(enc_inputs) predict = self._wrap_decoder_layer(dec_inputs, enc_output) if self._label_smooth_eps: - label_out = fluid.layers.label_smooth( + label_out = F.label_smooth( label=fluid.layers.one_hot( input=label, depth=self._trg_vocab_size ), diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py index 8fad7dc1ea2af..7f6e6d8434d18 100644 --- a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py +++ b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py @@ -48,11 +48,10 @@ def fluid_layer(self, place): label_var = fluid.data( "input", self.label_shape, dtype=self.dtype ) - y_var = fluid.layers.label_smooth( + y_var = F.label_smooth( label_var, prior_dist=self.prior_dist, epsilon=self.epsilon, - dtype=self.dtype, ) feed_dict = {"input": self.label} exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index cc33db2385afb..60e0543287662 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -33,6 +33,7 @@ from paddle.fluid.dygraph import base from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import _test_eager_guard +import paddle.nn.functional as F class LayerTest(unittest.TestCase): @@ -3359,9 +3360,7 @@ def make_label_smooth(self): with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): label = self._get_data(name="label", shape=[1], dtype="int32") one_hot_label = layers.one_hot(input=label, depth=10) - smooth_label = layers.label_smooth( - 
label=one_hot_label, epsilon=0.1, dtype="int32" - ) + smooth_label = F.label_smooth(label=one_hot_label, epsilon=0.1) return smooth_label def make_topk(self): From db0ea0ce70fd9f701a17052255228bb3b1284682 Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Wed, 23 Nov 2022 18:38:21 +0800 Subject: [PATCH 174/210] add masked_select_grad kernel (#48137) * add stat tool * add roll and roll_grad kernels and strided_slice and strided_slice_grad kernels, test=kunlun * add masked_selected_grad kernel,test=kunlun --- .../fluid/platform/device/xpu/xpu2_op_list.h | 15 +- paddle/phi/kernels/xpu/concat_kernel.cc | 13 +- paddle/phi/kernels/xpu/conv_kernel.cc | 87 ++ .../kernels/xpu/masked_select_grad_kernel.cc | 57 ++ paddle/phi/kernels/xpu/sgd_kernel.cc | 102 ++- .../tests/unittests/xpu/test_conv3d_op_xpu.py | 742 ++++++++++++++++++ .../xpu/test_masked_select_op_xpu.py | 3 + .../tests/unittests/xpu/test_sgd_op_xpu.py | 73 ++ 8 files changed, 1071 insertions(+), 21 deletions(-) create mode 100644 paddle/phi/kernels/xpu/masked_select_grad_kernel.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 4862401f83a6f..62a4daf727503 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -123,13 +123,17 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"conv3d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"conv2d_transpose_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"conv2d_transpose", @@ -375,6 +379,12 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"masked_select_grad", + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, @@ -502,6 +512,9 @@ XPUOpMap& get_kl2_ops() { {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"sgd_dense_param_sparse_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits", diff --git a/paddle/phi/kernels/xpu/concat_kernel.cc b/paddle/phi/kernels/xpu/concat_kernel.cc index 50b323429b067..4e09f6ef85281 100644 --- a/paddle/phi/kernels/xpu/concat_kernel.cc +++ b/paddle/phi/kernels/xpu/concat_kernel.cc @@ -50,6 +50,7 @@ void ConcatKernel(const Context& dev_ctx, x[0]->dims().size())); // If axis is 0, the lod of the output is not the same as inputs. 
+ if (axis == 0 && x[0]->lod().size() > 0) { size_t lod_size_0 = x[0]->lod().size(); size_t lod_size = lod_size_0; @@ -79,7 +80,9 @@ void ConcatKernel(const Context& dev_ctx, } } } + dev_ctx.template Alloc(out); + std::vector> xdims_list; std::vector ptrs; for (unsigned int i = 0; i < x.size(); ++i) { @@ -97,6 +100,7 @@ void ConcatKernel(const Context& dev_ctx, PADDLE_ENFORCE_GT(xdims_list.size(), 0, phi::errors::InvalidArgument("No tensor need concat")); + int r = xpu::concat(dev_ctx.x_context(), ptrs, reinterpret_cast(out->data()), @@ -107,5 +111,10 @@ void ConcatKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - concat, XPU, ALL_LAYOUT, phi::ConcatKernel, float, phi::dtype::float16) {} +PD_REGISTER_KERNEL(concat, + XPU, + ALL_LAYOUT, + phi::ConcatKernel, + float, + phi::dtype::float16, + int64_t) {} diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index 8bbbdc2c16d8b..bca16c84a907c 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -131,9 +131,96 @@ void DepthwiseConvKernel(const Context& dev_ctx, out); } +template +void Conv3DKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + DenseTensor* out) { + using XPUT = typename XPUTypeTrait::Type; + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + dev_ctx.template Alloc(out); + + phi::DDim in_data_dims = + phi::slice_ddim(input.dims(), 2, input.dims().size()); + phi::DDim filter_data_dims = + phi::slice_ddim(filter.dims(), 2, filter.dims().size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + int batch_size = static_cast(input.dims()[0]); + int img_c = static_cast(input.dims()[1]); + int img_d = static_cast(input.dims()[2]); + int img_h = static_cast(input.dims()[3]); + int img_w = static_cast(input.dims()[4]); + int f = static_cast(filter.dims()[0]); + bool is_ncdhw = true; + if (data_format == "NDHWC") { + img_c = static_cast(input.dims()[4]); + img_d = static_cast(input.dims()[1]); + img_h = static_cast(input.dims()[2]); + img_w = static_cast(input.dims()[3]); + is_ncdhw = false; + } + + XPUT* output_data = reinterpret_cast(out->data()); + const XPUT* filter_data = reinterpret_cast(filter.data()); + const XPUT* input_data = reinterpret_cast(input.data()); + + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + XPUT* filter_data_tmp; + const XPUT* filter_data_ptr = filter_data; + if (data_format == "NDHWC") { + filter_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); + std::vector filter_shape = phi::vectorize(filter.dims()); + int r = xpu::transpose(dev_ctx.x_context(), + filter_data, + filter_data_tmp, + filter_shape, + {0, 2, 3, 4, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + filter_data_ptr = reinterpret_cast(filter_data_tmp); + } + + int r = xpu::conv3d(dev_ctx.x_context(), + input_data, + filter_data_ptr, + output_data, + batch_size, + img_c, + img_d, + img_h, + img_w, + f, + ksize, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + 
is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv3d"); +} + } // namespace phi PD_REGISTER_KERNEL( conv2d, XPU, ALL_LAYOUT, phi::ConvKernel, float, phi::dtype::float16) {} PD_REGISTER_KERNEL( depthwise_conv2d, XPU, ALL_LAYOUT, phi::DepthwiseConvKernel, float) {} +PD_REGISTER_KERNEL( + conv3d, XPU, ALL_LAYOUT, phi::Conv3DKernel, float, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc new file mode 100644 index 0000000000000..52a98c63f4898 --- /dev/null +++ b/paddle/phi/kernels/xpu/masked_select_grad_kernel.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/masked_select_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MaskedSelectGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + using XPUType = typename XPUTypeTrait::Type; + auto* mask_data = mask.data(); + auto* input_data = reinterpret_cast(out_grad.data()); + auto* out_data = + reinterpret_cast(dev_ctx.template Alloc(x_grad)); + + auto mask_shape = phi::vectorize(mask.dims()); + auto xshape = phi::vectorize(x_grad->dims()); + + int r = xpu::masked_select_grad(dev_ctx.x_context(), + input_data, + mask_data, + out_data, + xshape, + mask_shape, + 1); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "masked_select_grad"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(masked_select_grad, + XPU, + ALL_LAYOUT, + phi::MaskedSelectGradKernel, + float, + phi::dtype::float16, + int, + bool, + int64_t) {} diff --git a/paddle/phi/kernels/xpu/sgd_kernel.cc b/paddle/phi/kernels/xpu/sgd_kernel.cc index 510fddae3ba7c..1f821a8de2821 100644 --- a/paddle/phi/kernels/xpu/sgd_kernel.cc +++ b/paddle/phi/kernels/xpu/sgd_kernel.cc @@ -20,14 +20,14 @@ namespace phi { template -void SGDDenseKernel(const Context &dev_ctx, - const DenseTensor ¶m, - const DenseTensor &learning_rate, - const DenseTensor &grad, - const paddle::optional &master_param, +void SGDDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& learning_rate, + const DenseTensor& grad, + const paddle::optional& master_param, bool multi_precision, - DenseTensor *param_out, - DenseTensor *master_param_out) { + DenseTensor* param_out, + DenseTensor* master_param_out) { using XPUType = typename XPUTypeTrait::Type; auto sz = param_out->numel(); PADDLE_ENFORCE_EQ( @@ -49,37 +49,103 @@ void SGDDenseKernel(const Context &dev_ctx, grad.numel(), sz)); - const T *lr_t = learning_rate.data(); + const T* lr_t = learning_rate.data(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const float *lr = nullptr; + const float* lr = nullptr; if (std::is_same::value) { - float *lr_float = RAII_GUARD.alloc_l3_or_gm(learning_rate.numel()); + float* lr_float = 
RAII_GUARD.alloc_l3_or_gm(learning_rate.numel()); int r = xpu::cast(dev_ctx.x_context(), - reinterpret_cast(lr_t), + reinterpret_cast(lr_t), lr_float, learning_rate.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); lr = lr_float; } else { - lr = reinterpret_cast(lr_t); + lr = reinterpret_cast(lr_t); } - const T *param_data = param.data(); - const T *grad_data = grad.data(); + const T* param_data = param.data(); + const T* grad_data = grad.data(); dev_ctx.template Alloc(param_out); - T *out_data = param_out->data(); + T* out_data = param_out->data(); int r = xpu::sgd(dev_ctx.x_context(), - reinterpret_cast(grad_data), - reinterpret_cast(param_data), + reinterpret_cast(grad_data), + reinterpret_cast(param_data), lr, - reinterpret_cast(out_data), + reinterpret_cast(out_data), sz); PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd"); } +template +void SGDDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& learning_rate, + const SelectedRows& grad, + const paddle::optional& master_param, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* master_param_out) { + using XPUType = typename XPUTypeTrait::Type; + dev_ctx.template Alloc(param_out); + + PADDLE_ENFORCE_EQ( + ¶m, + param_out, + phi::errors::InvalidArgument( + "The input tensor Param of SgdOp should be equal with ParamOut " + "if variable's type is SelectedRows.")); + + auto in_height = grad.height(); + auto out_dims = param_out->dims(); + PADDLE_ENFORCE_EQ(in_height, + out_dims[0], + phi::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp should be " + "equal with ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + in_height, + out_dims[0])); + + auto& in_value = grad.value(); + auto& in_rows = grad.rows(); + int64_t* in_rows_data = nullptr; + xpu::VectorParam in_rows_vec{ + in_rows.data(), static_cast(in_rows.size()), in_rows_data}; + + int64_t in_row_numel = in_value.numel() / in_rows.size(); + PADDLE_ENFORCE_EQ(in_row_numel, + param_out->numel() / in_height, + phi::errors::InvalidArgument( + "The in_row_numel of SgdOp should be equal with " + "param_out's numel / in_height.")); + + auto* in_data = in_value.data(); + auto* out_data = param_out->data(); + + int r = xpu::sparse_sgd( + dev_ctx.x_context(), + reinterpret_cast(in_data), + reinterpret_cast(param.data()), + learning_rate.data(), + in_rows_vec, + reinterpret_cast(out_data), + in_row_numel, + in_rows.size()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sparse_sgd"); +} + } // namespace phi PD_REGISTER_KERNEL( sgd, XPU, ALL_LAYOUT, phi::SGDDenseKernel, phi::dtype::float16, float) {} +PD_REGISTER_KERNEL(sgd_dense_param_sparse_grad, + XPU, + ALL_LAYOUT, + phi::SGDDenseParamSparseGradKernel, + phi::dtype::float16, + float) {} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py new file mode 100644 index 0000000000000..f949d7eeef87b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py @@ -0,0 +1,742 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +sys.path.append("..") +import unittest +import numpy as np + +from op_test_xpu import XPUOpTest +import paddle.fluid as fluid +import paddle +from xpu.get_test_cover_info import ( + create_test_class, + XPUOpTestWrapper, +) + + +def conv3d_forward_naive( + input, + filter, + group, + conv_param, + padding_algorithm='EXPLICIT', + data_format="NCDHW", +): + + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + "Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % str(padding_algorithm) + ) + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Unknown Attr(data_format): '%s' ." + "It can only be 'NCDHW' or 'NDHWC'." % str(data_format) + ) + + channel_last = data_format == "NDHWC" + if channel_last: + input = np.transpose(input, [0, 4, 1, 2, 3]) + + in_n, in_c, in_d, in_h, in_w = input.shape + + f_n, f_c, f_d, f_h, f_w = filter.shape + out_n = in_n + out_c = f_n + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + sub_out_c = out_c // group + sub_f_n = f_n // group + + stride, pad, dilation = ( + conv_param['stride'], + conv_param['pad'], + conv_param['dilations'], + ) + + # update pad and dilation + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, pool_size, pool_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter.shape[2:5] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilation = [1, 1, 1] + input_data_shape = input.shape[2:5] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_d_0, pad_d_1 = pad[0], pad[0] + pad_h_0, pad_h_1 = pad[1], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[2] + if len(pad) == 6: + pad_d_0, pad_d_1 = pad[0], pad[1] + pad_h_0, pad_h_1 = pad[2], pad[3] + pad_w_0, pad_w_1 = pad[4], pad[5] + + out_d = ( + 1 + + (in_d + pad_d_0 + pad_d_1 - (dilation[0] * (f_d - 1) + 1)) + // stride[0] + ) + out_h = ( + 1 + + (in_h + pad_h_0 + pad_h_1 - (dilation[1] * (f_h - 1) + 1)) + // stride[1] + ) + out_w = ( + 1 + + (in_w + pad_w_0 + pad_w_1 - (dilation[2] * (f_w - 1) + 1)) + // stride[2] + ) + + out = np.zeros((in_n, out_c, out_d, out_h, out_w)) + + d_bolck_d = dilation[0] * (f_d - 1) + 1 + d_bolck_h = dilation[1] * (f_h - 1) + 1 + d_bolck_w = dilation[2] * (f_w - 1) + 1 + + input_pad = np.pad( + input, + ( + (0, 0), + (0, 0), + (pad_d_0, pad_d_1), + (pad_h_0, pad_h_1), + (pad_w_0, pad_w_1), + ), + mode='constant', + constant_values=0, + ) + + filter_dilation = np.zeros((f_n, f_c, d_bolck_d, d_bolck_h, d_bolck_w)) + filter_dilation[ + :, + :, + 0 : d_bolck_d : dilation[0], + 0 : d_bolck_h : dilation[1], + 0 : d_bolck_w : dilation[2], + ] = filter + + for d in range(out_d): + for i in range(out_h): + for j in range(out_w): + for g in range(group): + input_pad_masked = 
input_pad[ + :, + g * f_c : (g + 1) * f_c, + d * stride[0] : d * stride[0] + d_bolck_d, + i * stride[1] : i * stride[1] + d_bolck_h, + j * stride[2] : j * stride[2] + d_bolck_w, + ] + + f_sub = filter_dilation[ + g * sub_f_n : (g + 1) * sub_f_n, :, :, :, : + ] + for k in range(sub_out_c): + out[:, g * sub_out_c + k, d, i, j] = np.sum( + input_pad_masked * f_sub[k, :, :, :, :], + axis=(1, 2, 3, 4), + ) + if channel_last: + out = np.transpose(out, [0, 2, 3, 4, 1]) + return out + + +def create_test_padding_SAME_class(parent): + class TestPaddingSMAECase(parent): + def init_paddings(self): + self.pad = [0, 0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSMAECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSMAECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{0}_{1}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +def create_test_channel_last_class(parent): + class TestChannelLastCase(parent): + def init_data_format(self): + self.data_format = "NDHWC" + + def init_test_case_2(self): + N, C, D, H, W = self.input_size + self.input_size = [N, D, H, W, C] + + cls_name = "{0}_{1}".format(parent.__name__, "ChannelLast") + TestChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestChannelLastCase + + +paddle.enable_static() + + +class XPUTestConv3DOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'conv3d' + self.use_dynamic_create_class = False + + class TestConv3DOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.op_type = "conv3d" + self.use_cudnn = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv3d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilations': self.dilations, + } + + np.random.seed(100) + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv3d_forward_naive( + input, + filter, + self.groups, + conv3d_param, + ).astype(self.dtype) + + self.inputs = { + 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), + 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter), + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + } + self.outputs = {'Output': output} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] + + def init_test_case_2(self): + pass + + def init_dilation(self): + self.dilations = [1, 1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + class TestCase1(TestConv3DOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] 
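    # Minimal usage sketch for the conv3d path exercised by these tests. It
    # assumes an XPU device is available and that paddle.nn.functional.conv3d
    # dispatches to the conv3d kernel registered above; shapes are illustrative.
    #
    #   import paddle
    #   paddle.set_device("xpu")
    #   x = paddle.randn([2, 3, 4, 4, 4])   # NCDHW input
    #   w = paddle.randn([6, 3, 3, 3, 3])   # [out_c, in_c // groups, kD, kH, kW]
    #   y = paddle.nn.functional.conv3d(x, w, stride=1, padding=0)
    #   print(y.shape)                      # [2, 6, 2, 2, 2]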
+ + class TestWithGroup1(TestConv3DOp): + def init_group(self): + self.groups = 3 + + class TestWithGroup2(TestCase1): + def init_group(self): + self.groups = 3 + + class TestWith1x1(TestConv3DOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1, 1] + + def init_dilation(self): + self.dilations = [1, 1, 1] + + def init_group(self): + self.groups = 3 + + class TestWithInput1x1Filter1x1(TestConv3DOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.input_size = [40, 3, 1, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [120, f_c, 1, 1, 1] + + def init_dilation(self): + self.dilations = [1, 1, 1] + + def init_group(self): + self.groups = 3 + + class TestWithDilation(TestConv3DOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.input_size = [2, 3, 6, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 2, 2, 2] + + def init_dilation(self): + self.dilations = [2, 2, 2] + + def init_group(self): + self.groups = 3 + + +# ---- test asymmetric padding ---- +class XPUTestConv3DOp_v2(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'conv3d' + self.use_dynamic_create_class = False + + class TestConv3DOp_2(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.op_type = "conv3d" + self.use_cudnn = False + self.use_mkldnn = False + self.data_format = "NCDHW" + self.init_kernel_type() + self.init_group() + self.init_dilation() + self.init_data_format() + self.init_test_case() + self.init_paddings() + + self.init_test_case_2() + + conv3d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilations': self.dilations, + } + + np.random.seed(100) + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv3d_forward_naive( + input, + filter, + self.groups, + conv3d_param, + self.padding_algorithm, + self.data_format, + ).astype(self.dtype) + + self.inputs = { + 'Input': XPUOpTest.np_dtype_to_fluid_dtype(input), + 'Filter': XPUOpTest.np_dtype_to_fluid_dtype(filter), + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + } + self.outputs = {'Output': output} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def init_test_case(self): + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] + + def init_test_case_2(self): + pass + + def init_dilation(self): + self.dilations = [1, 1, 1] + + def init_group(self): + self.groups = 1 + + def init_kernel_type(self): + pass + + def init_paddings(self): + self.pad = [0, 0, 0] + self.padding_algorithm = "EXPLICIT" + + def init_data_format(self): + self.data_format = "NCDHW" + + class TestConv3DOp_AsyPadding(TestConv3DOp_2): + def init_test_case(self): + self.stride = [1, 1, 2] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert 
np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] + + def init_paddings(self): + self.pad = [1, 0, 1, 0, 0, 2] + self.padding_algorithm = "EXPLICIT" + + class TestConv3DOp_DiffDataInDiffDim(TestConv3DOp_2): + def init_test_case(self): + self.stride = [1, 1, 2] + self.input_size = [2, 3, 4, 5, 5] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 4, 3] + + def init_paddings(self): + self.pad = [1, 0, 1, 0, 0, 2] + self.padding_algorithm = "EXPLICIT" + + class TestCase1_AsyPadding(TestConv3DOp_2): + def init_test_case(self): + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] + + def init_paddings(self): + self.pad = [0, 0, 1, 0, 0, 2] + self.padding_algorithm = "EXPLICIT" + + class TestWithGroup1_AsyPadding(TestConv3DOp_2): + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [1, 1, 1, 0, 0, 2] + self.padding_algorithm = "EXPLICIT" + + class TestWithGroup2_AsyPadding(TestConv3DOp_2): + def init_test_case(self): + self.stride = [1, 1, 1] + self.input_size = [2, 3, 4, 4, 4] # NCDHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3, 3] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [1, 1, 0, 1, 0, 2] + self.padding_algorithm = "EXPLICIT" + + class TestWithDilation_AsyPadding(TestConv3DOp_2): + def init_test_case(self): + self.stride = [1, 1, 1] + self.input_size = [2, 3, 6, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 2, 2, 2] + + def init_dilation(self): + self.dilations = [2, 2, 2] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 0, 1, 0, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +# --------- test python API --------------- +class TestConv3DAPI(unittest.TestCase): + def test_api(self): + + input_NDHWC = fluid.layers.data( + name="input_NDHWC", + shape=[2, 5, 5, 5, 3], + append_batch_size=False, + dtype="float32", + ) + + input_NCDHW = fluid.layers.data( + name="input_NCDHW", + shape=[2, 3, 5, 5, 3], + append_batch_size=False, + dtype="float32", + ) + + fluid.layers.conv3d( + input=input_NDHWC, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding=0, + dilation=[1, 1, 1], + groups=1, + data_format="NCDHW", + ) + + fluid.layers.conv3d( + input=input_NCDHW, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding=[1, 2, 1, 0, 1, 0], + dilation=[1, 1, 1], + groups=1, + data_format="NCDHW", + ) + + fluid.layers.conv3d( + input=input_NCDHW, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding=[[0, 0], [0, 0], [1, 1], [1, 1], [1, 1]], + dilation=[1, 1, 1], + groups=1, + data_format="NCDHW", + ) + + fluid.layers.conv3d( + input=input_NDHWC, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding=[[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]], + dilation=[1, 1, 1], + groups=1, + data_format="NDHWC", + ) + + fluid.layers.conv3d( + input=input_NCDHW, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding="SAME", + dilation=[1, 1, 1], + groups=1, + data_format="NCDHW", + ) + + fluid.layers.conv3d( + input=input_NCDHW, + num_filters=3, + 
filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding="VALID", + dilation=[1, 1, 1], + groups=1, + data_format="NCDHW", + ) + + +class TestConv3DAPI_Error(unittest.TestCase): + def test_api(self): + input = fluid.layers.data( + name="input", + shape=[2, 5, 5, 5, 4], + append_batch_size=False, + dtype="float32", + ) + + # ValueError: cudnn + def run_1(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + use_cudnn=[0], + data_format="NCDHW", + ) + + self.assertRaises(ValueError, run_1) + + # ValueError: data_format + def run_2(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=[3, 3, 3], + stride=[1, 1, 1], + padding=0, + dilation=[1, 1, 1], + groups=1, + use_cudnn=False, + data_format="NCHWC", + ) + + self.assertRaises(ValueError, run_2) + + # ValueError: padding + def run_3(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=3, + stride=1, + padding="SAMEE", + dilation=1, + groups=1, + use_cudnn=False, + data_format="NCDHW", + ) + + self.assertRaises(ValueError, run_3) + + def run_4(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=3, + stride=1, + padding=[[0, 1], [0, 0], [0, 1], [0, 1], [0, 1]], + dilation=1, + groups=1, + use_cudnn=False, + data_format="NCDHW", + ) + + self.assertRaises(ValueError, run_4) + + def run_5(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=0, + stride=0, + padding=[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1]], + dilation=1, + groups=1, + use_cudnn=False, + data_format="NDHWC", + ) + + self.assertRaises(ValueError, run_5) + + # ValueError: channel dimmention + x = fluid.layers.data( + name="x", + shape=[2, 5, 5, 5, -1], + append_batch_size=False, + dtype="float32", + ) + + def run_6(): + fluid.layers.conv3d( + input=x, + num_filters=3, + filter_size=3, + stride=1, + padding=0, + dilation=1, + groups=1, + use_cudnn=False, + data_format="NDHWC", + ) + + self.assertRaises(ValueError, run_6) + + # ValueError: groups + def run_7(): + fluid.layers.conv3d( + input=input, + num_filters=3, + filter_size=3, + stride=1, + padding=0, + dilation=1, + groups=3, + use_cudnn=False, + data_format="NDHWC", + ) + + self.assertRaises(ValueError, run_7) + + # ValueError: filter num + def run_8(): + fluid.layers.conv3d( + input=input, + num_filters=0, + filter_size=0, + stride=0, + padding=0, + dilation=0, + groups=1, + use_cudnn=False, + data_format="NDHWC", + ) + + self.assertRaises(ValueError, run_8) + + +for stype in ["float32"]: + create_test_class(globals(), XPUTestConv3DOp, stype) + create_test_class(globals(), XPUTestConv3DOp_v2, stype) +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py index 6a2976ccbb528..f596f22dd4993 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py @@ -58,6 +58,9 @@ def setUp(self): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Y') + def init(self): self.shape = (50, 3) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index 2c7bb9414104b..7929b0f3fc315 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -19,6 +19,8 @@ sys.path.append("..") import paddle import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator from op_test_xpu import XPUOpTest from xpu.get_test_cover_info import ( @@ -83,6 +85,77 @@ def runTest(self): result = exe.run(fluid.default_main_program(), fetch_list=[avg_cost]) +class TestSparseSGDOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7] + self.conf() + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), self.row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + # create and initialize Param Variable + param = scope.var('Param').get_tensor() + param_array = np.full((height, self.row_numel), 5.0).astype("float32") + param.set(param_array, place) + + # create and initialize LeraningRate Variable + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), 2.0).astype("float32") + lr.set(lr_array, place) + + # create and run sgd operator + sgd_op = Operator( + "sgd", + Param='Param', + Grad='Grad', + ParamOut='Param', + LearningRate='LearningRate', + ) + sgd_op.run(scope, place) + + # get and compare result + result_array = np.array(param) + + # rows[0] = 0, 5.0 - 2.0 * 2.0 + self.assertAlmostEqual(1.0, result_array[rows[0], 0]) + # rows[0] = 0, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[0], 2]) + # 5.0 - 2.0 * 0.0 + self.assertAlmostEqual(5.0, result_array[1, 0]) + # rows[1] = 4, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[1], 10]) + # 5.0 - 2.0 * 0.0 + self.assertAlmostEqual(5.0, result_array[5, 8]) + # rows[2] = 7, 5.0 - 2.0 * 1.0 + self.assertAlmostEqual(3.0, result_array[rows[2], 1]) + # rows[2] = 7, 5.0 - 2.0 * 4.0 + self.assertAlmostEqual(-3.0, result_array[rows[2], 8]) + + def test_sparse_sgd(self): + places = [core.XPUPlace(0)] + for place in places: + self.check_with_place(place) + + def conf(self): + self.row_numel = 12 + + +class TestSparseSGDOpCase8X(TestSparseSGDOp): + def conf(self): + self.row_numel = 16 + + if __name__ == "__main__": paddle.enable_static() unittest.main() From e16d9a8e57bbafd3cc3462d20c2ead6e033bcfc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Wed, 23 Nov 2022 18:41:23 +0800 Subject: [PATCH 175/210] remove_leaky_relu in nn.py under fluid (#47901) --- python/paddle/fluid/layers/nn.py | 28 ------------------- .../unittests/dygraph_to_static/darknet.py | 2 +- .../dygraph_to_static/test_cycle_gan.py | 4 +-- .../test_mkldnn_elt_act_fuse_pass.py | 6 ++-- .../ir/inference/test_trt_activation_pass.py | 2 +- .../unittests/test_activation_nn_grad.py | 2 +- .../tests/unittests/test_activation_op.py | 1 - .../unittests/test_imperative_double_grad.py | 2 +- ...perative_star_gan_with_gradient_penalty.py | 4 +-- .../tests/unittests/test_inplace_abn_op.py | 2 +- .../test_paddle_imperative_double_grad.py | 2 +- 11 files changed, 13 insertions(+), 42 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3d26fc1260381..f8c741585081a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -121,7 +121,6 @@ 'pow', 'prelu', 'brelu', - 'leaky_relu', 'flatten', 'pad2d', 
'unique', @@ -8238,33 +8237,6 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.leaky_relu") -@templatedoc() -def leaky_relu(x, alpha=0.02, name=None): - """ - ${comment} - Args: - x(${x_type}): ${x_comment} - alpha(${alpha_type}|0.02): ${alpha_comment} - name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - output(${out_type}): ${out_comment} - - Examples: - - .. code-block:: python - - import paddle - - x = paddle.to_tensor([[-1, 2], [3, -4]], dtype='float32') - y = paddle.fluid.layers.leaky_relu(x, alpha=0.1) - print(y) # [[-0.1, 2], [3, -0.4]] - - """ - return paddle.nn.functional.leaky_relu(x, alpha, name) - - def flatten(x, axis=1, name=None): r""" **Flatten op** diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py index b1cb22c57008d..b532a36821445 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/darknet.py @@ -65,7 +65,7 @@ def forward(self, inputs): out = self.conv(inputs) out = self.batch_norm(out) if self.act == 'leaky': - out = fluid.layers.leaky_relu(x=out, alpha=0.1) + out = paddle.nn.functional.leaky_relu(out, 0.1) return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index ab79a05796de4..a41791eb04ef4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -396,7 +396,7 @@ def forward(self, inputs): if self.norm: conv = self.bn(conv) if self.relu: - conv = fluid.layers.leaky_relu(conv, alpha=self.relufactor) + conv = paddle.nn.functional.leaky_relu(conv, self.relufactor) return conv @@ -468,7 +468,7 @@ def forward(self, inputs): if self.norm: conv = self.bn(conv) if self.relu: - conv = fluid.layers.leaky_relu(conv, alpha=self.relufactor) + conv = paddle.nn.functional.leaky_relu(conv, self.relufactor) return conv diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index bcf79b82baf47..6cb722517468d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -84,7 +84,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( def set_params(self): self.operand = fluid.layers.elementwise_add self.act_alpha = 0.2 - self.act = fluid.layers.leaky_relu + self.act = paddle.nn.functional.leaky_relu class ElementwiseActivationMkldnnFusePassTest_Add_Swish( @@ -184,7 +184,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( def set_params(self): self.operand = fluid.layers.elementwise_sub self.act_alpha = 0.2 - self.act = fluid.layers.leaky_relu + self.act = paddle.nn.functional.leaky_relu class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( @@ -276,7 +276,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( def set_params(self): self.operand = fluid.layers.elementwise_mul self.act_alpha = 0.2 - self.act = fluid.layers.leaky_relu + self.act = paddle.nn.functional.leaky_relu class 
ElementwiseActivationMkldnnFusePassTest_Mul_Swish( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 7f4276bff5e7d..763608afdf7ac 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -67,7 +67,7 @@ def test_check_output(self): class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.leaky_relu(x) + return paddle.nn.functional.leaky_relu(x) class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 4182d1c586df5..e0c40e2002111 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -217,7 +217,7 @@ def func(self, place): x = layers.data('x', shape, False, dtype) x.persistable = True - y = layers.leaky_relu(x, alpha=alpha) + y = paddle.nn.functional.leaky_relu(x, alpha) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) x_arr[np.abs(x_arr) < 0.005] = 0.02 diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 913777c2515f4..abfc0527dbde4 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1689,7 +1689,6 @@ def init_shape(self): class TestLeakyReluAPI(unittest.TestCase): # test paddle.nn.LeakyReLU, paddle.nn.functional.leaky_relu, - # fluid.layers.leaky_relu def setUp(self): np.random.seed(1024) self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index f121bacb2a585..c38caf69e086a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -365,7 +365,7 @@ def func_none_one_initial_gradient(self): x.stop_gradient = False alpha = 0.2 - y = fluid.layers.leaky_relu(x, alpha=alpha) + y = paddle.nn.functional.leaky_relu(x, alpha) y = y * y z = y * y diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index a75bc4b8a8e58..2c8d408316b6d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -163,7 +163,7 @@ def forward(self, input): conv = self._norm(conv) if self.relufactor is not None: - conv = fluid.layers.leaky_relu(conv, alpha=self.relufactor) + conv = paddle.nn.functional.leaky_relu(conv, self.relufactor) return conv @@ -205,7 +205,7 @@ def forward(self, input): deconv = self._norm(deconv) if self.relufactor is not None: - deconv = fluid.layers.leaky_relu(deconv, alpha=self.relufactor) + deconv = paddle.nn.functional.leaky_relu(deconv, self.relufactor) return deconv diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index d02214623b7ce..56f3c13f4f33d 100644 --- 
a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -78,7 +78,7 @@ def build_program( in_place=inplace, ) if activation == 'leaky_relu': - bn = fluid.layers.leaky_relu(bn, alpha) + bn = paddle.nn.functional.leaky_relu(bn, alpha) if activation == 'elu': bn = paddle.nn.functional.elu(bn, alpha) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index e6e0e50ac76d7..9d7bfc6888f1a 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -164,7 +164,7 @@ def func_none_one_initial_gradient(self): x.stop_gradient = False alpha = 0.2 - y = fluid.layers.leaky_relu(x, alpha=alpha) + y = paddle.nn.functional.leaky_relu(x, alpha) y = y * y z = y * y From e62ab3755672ba5027b9f628164b554b50d5e387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Wed, 23 Nov 2022 18:41:40 +0800 Subject: [PATCH 176/210] remove pow in nn.py under fluid (#47878) --- .../contrib/slim/quantization/adaround.py | 3 +- python/paddle/fluid/layers/nn.py | 54 ------------------- .../dygraph_to_static/ifelse_simple_func.py | 2 +- .../tests/unittests/ipu/test_pow_op_ipu.py | 4 +- .../tests/unittests/test_activation_op.py | 25 +-------- .../fluid/tests/unittests/test_name_scope.py | 3 +- 6 files changed, 9 insertions(+), 82 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/adaround.py b/python/paddle/fluid/contrib/slim/quantization/adaround.py index b253e15918a5d..257363baf9f3a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/adaround.py +++ b/python/paddle/fluid/contrib/slim/quantization/adaround.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid +import paddle from ....log_helper import get_logger from .utils import ( @@ -76,7 +77,7 @@ def round_loss_fn(): # calculate regularization term - which ensures parameter to converge to exactly zeros and ones # at the end of optimization reg_term = fluid.layers.reduce_sum( - -fluid.layers.pow(paddle.abs(2 * h_v - 1), factor=beta) + 1 + -paddle.pow(paddle.abs(2 * h_v - 1), beta) + 1 ) # calculate the rounding loss diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f8c741585081a..517e86c3f91d7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -118,7 +118,6 @@ 'relu', 'log', 'crop_tensor', - 'pow', 'prelu', 'brelu', 'flatten', @@ -8032,59 +8031,6 @@ def pad2d( return out -@templatedoc() -def pow(x, factor=1.0, name=None): - """ - This is Pow Activation Operator. - - :math:`out = x^{factor}` - - Args: - x(Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float32`` or ``float64``. - factor(float32|Variable, optional): A scalar with type ``float32`` or a ``Tensor`` with shape [1] and type ``float32``. The exponential factor of Pow. Default 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``x``. - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - - x = fluid.data(name="x", shape=[32,32], dtype="float32") - - # example 1: argument factor is float - y_1 = fluid.layers.pow(x, factor=2.0) - # y_1 is x^{2.0} - - # example 2: argument factor is Variable - factor_tensor = fluid.layers.fill_constant([1], "float32", 3.0) - y_2 = fluid.layers.pow(x, factor=factor_tensor) - # y_2 is x^{3.0} - """ - check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow' - ) - - helper = LayerHelper('pow', **locals()) - inputs = {'X': x} - attrs = {} - if isinstance(factor, Variable): - check_variable_and_dtype(factor, 'factor', ['float32'], 'pow') - factor.stop_gradient = True - inputs['FactorTensor'] = factor - else: - attrs['factor'] = factor - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs - ) - return out - - @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 8459d0d60e7d3..3862ab6f9420a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -60,7 +60,7 @@ def dyfunc_with_if_else2(x, col=100): if fluid.layers.reduce_mean(x).numpy()[0] > x.numpy()[row][col]: y = fluid.layers.relu(x) else: - x_pow = fluid.layers.pow(x, 2) + x_pow = paddle.pow(x, 2) y = paddle.tanh(x_pow) return y diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index f61a8b8a24e39..28b71ab3c8853 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -46,7 +46,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) - out = paddle.fluid.layers.pow(x, **self.attrs) + out = paddle.pow(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): @@ -85,7 +85,7 @@ def build_model(self): factor = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32' ) - out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) + out = paddle.pow(x, factor, **self.attrs) self.fetch_list = [out.name] diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index abfc0527dbde4..39a866434afd6 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2783,8 +2783,8 @@ def test_api(self): factor_1 = 2.0 factor_2 = fluid.layers.fill_constant([1], "float32", 3.0) - out_1 = fluid.layers.pow(x, factor=factor_1) - out_2 = fluid.layers.pow(x, factor=factor_2) + out_1 = paddle.pow(x, factor_1) + out_2 = paddle.pow(x, factor_2) out_4 = paddle.pow(x, factor_1, name='pow_res') out_6 = paddle.pow(x, factor_2) self.assertEqual(('pow_res' in out_4.name), True) @@ -2800,27 +2800,6 @@ def test_api(self): assert np.allclose(res_2, np.power(input, 3)) assert np.allclose(res_6, np.power(input, 3)) - def test_error(self): - in1 = fluid.layers.data( - name="in1", shape=[11, 17], append_batch_size=False, dtype="int32" - ) - in2 = 
fluid.layers.data( - name="in2", shape=[11, 17], append_batch_size=False, dtype="int64" - ) - in3 = fluid.layers.data( - name="in3", shape=[11, 17], append_batch_size=False, dtype="float32" - ) - in4 = fluid.layers.data( - name="in4", shape=[11, 17], append_batch_size=False, dtype="float64" - ) - - factor_1 = fluid.layers.fill_constant([1], "float64", 3.0) - - self.assertRaises(TypeError, fluid.layers.pow, x=in1, factor=factor_1) - self.assertRaises(TypeError, fluid.layers.pow, x=in2, factor=factor_1) - self.assertRaises(TypeError, fluid.layers.pow, x=in3, factor=factor_1) - self.assertRaises(TypeError, fluid.layers.pow, x=in4, factor=factor_1) - def ref_stanh(x, scale_a=0.67, scale_b=1.7159): out = scale_b * np.tanh(x * scale_a) diff --git a/python/paddle/fluid/tests/unittests/test_name_scope.py b/python/paddle/fluid/tests/unittests/test_name_scope.py index 475ebec9ecee2..eb61e27e74080 100644 --- a/python/paddle/fluid/tests/unittests/test_name_scope.py +++ b/python/paddle/fluid/tests/unittests/test_name_scope.py @@ -14,6 +14,7 @@ import unittest import paddle.fluid as fluid +import paddle class TestNameScope(unittest.TestCase): @@ -26,7 +27,7 @@ def test_name_scope(self): with fluid.name_scope("s3"): d = c / 1 with fluid.name_scope("s1"): - f = fluid.layers.pow(d, 2.0) + f = paddle.pow(d, 2.0) with fluid.name_scope("s4"): g = f - 1 From b7d3143f4f64df18123f9d821b34d04fe5ea3601 Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Wed, 23 Nov 2022 19:03:46 +0800 Subject: [PATCH 177/210] Add nparray case for basic operator (#48229) * add nparray case for basic operator * fix unit test * fix unit test * add unit test * fix unit test --- paddle/fluid/pybind/eager_math_op_patch.cc | 211 +++++++++++++----- .../unittests/test_elementwise_add_op.py | 26 +++ .../unittests/test_elementwise_div_op.py | 29 +++ .../unittests/test_elementwise_mul_op.py | 27 +++ .../unittests/test_elementwise_sub_op.py | 30 +++ .../tests/unittests/test_math_op_patch.py | 102 +++++++++ .../tests/unittests/test_matmul_v2_op.py | 22 ++ 7 files changed, 396 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc index 24ec364efb3b6..8387123ae11fe 100644 --- a/paddle/fluid/pybind/eager_math_op_patch.cc +++ b/paddle/fluid/pybind/eager_math_op_patch.cc @@ -75,6 +75,47 @@ static bool IsNumpyType(PyObject* obj) { type_name == "numpy.int32" || type_name == "numpy.int16"; } +static bool IsNumpyArray(PyObject* obj) { + auto type_name = std::string(Py_TYPE(obj)->tp_name); + return type_name == "numpy.ndarray"; +} + +void InitTensorWithNumpyValue(const py::object& array, + const paddle::platform::Place& place, + Tensor* self, + bool zero_copy = false) { + PADDLE_ENFORCE_EQ( + self->defined(), + true, + paddle::platform::errors::Fatal( + "Calling InitTensorWithNumpyValue of Eager Tensor without " + "EmptyTensorInitializer is " + "forbidden. 
Please check your code and make sure you new a " + "eager tensor before init it with NumPy.")); + phi::DenseTensor* impl_ptr = + static_cast(self->impl().get()); + if (platform::is_cpu_place(place)) { + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); + } else if (platform::is_xpu_place(place)) { + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); + } else if (platform::is_gpu_place(place)) { + SetTensorFromPyArray( + impl_ptr, array, place, zero_copy); + } else if (platform::is_cuda_pinned_place(place)) { + SetTensorFromPyArray( + impl_ptr, array, place, zero_copy); + } else if (platform::is_npu_place(place)) { + SetTensorFromPyArray(impl_ptr, array, place, zero_copy); + } else if (platform::is_custom_place(place)) { + SetTensorFromPyArray( + impl_ptr, array, place, zero_copy); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Place should be one of " + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/CustomPlace")); + } +} + std::set _supported_int_dtype_{DataType::UINT8, DataType::INT8, DataType::INT16, @@ -192,7 +233,13 @@ static PyObject* tensor__add__method(TensorObject* self, // 2. create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__add__", 0); { @@ -200,8 +247,6 @@ static PyObject* tensor__add__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -289,7 +334,13 @@ static PyObject* tensor__sub__method(TensorObject* self, } // 2. create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__sub__", 0); { @@ -297,8 +348,6 @@ static PyObject* tensor__sub__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -382,7 +431,13 @@ static PyObject* tensor__rsub__method(TensorObject* self, // 2. 
create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__rsub__", 0); { @@ -390,8 +445,6 @@ static PyObject* tensor__rsub__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -477,7 +530,13 @@ static PyObject* tensor__mul__method(TensorObject* self, // 2. create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__mul__", 0); if (PyComplex_Check(other_obj)) { @@ -489,8 +548,6 @@ static PyObject* tensor__mul__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -579,7 +636,13 @@ static PyObject* tensor__div__method(TensorObject* self, // 2. create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__div__", 0); if (PyComplex_Check(other_obj)) { @@ -591,8 +654,6 @@ static PyObject* tensor__div__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -695,7 +756,13 @@ static PyObject* tensor__rdiv__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), place); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__rdiv__", 0); if (PyComplex_Check(other_obj)) { @@ -707,8 +774,6 @@ static PyObject* tensor__rdiv__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. 
promote types or unify right var type to left var @@ -809,7 +874,13 @@ static PyObject* tensor__gt__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), place); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__gt__", 0); if (PyComplex_Check(other_obj)) { @@ -821,8 +892,6 @@ static PyObject* tensor__gt__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -895,7 +964,13 @@ static PyObject* tensor__ge__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), place); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__ge__", 0); if (PyComplex_Check(other_obj)) { @@ -907,8 +982,6 @@ static PyObject* tensor__ge__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -982,7 +1055,13 @@ static PyObject* tensor__mod__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__mod__", 0); if (PyComplex_Check(other_obj)) { @@ -994,8 +1073,6 @@ static PyObject* tensor__mod__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. 
promote types or unify right var type to left var @@ -1068,7 +1145,13 @@ static PyObject* tensor__matmul__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__matmul__", 0); if (PyComplex_Check(other_obj)) { @@ -1080,8 +1163,6 @@ static PyObject* tensor__matmul__method(TensorObject* self, other_tensor = full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -1172,7 +1253,13 @@ static PyObject* tensor__lt__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__lt__", 0); if (PyComplex_Check(other_obj)) { @@ -1184,8 +1271,6 @@ static PyObject* tensor__lt__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -1258,7 +1343,13 @@ static PyObject* tensor__le__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__le__", 0); if (PyComplex_Check(other_obj)) { @@ -1270,8 +1361,6 @@ static PyObject* tensor__le__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. 
promote types or unify right var type to left var @@ -1345,7 +1434,13 @@ static PyObject* tensor__floordiv__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__floordiv__", 0); if (PyComplex_Check(other_obj)) { @@ -1357,8 +1452,6 @@ static PyObject* tensor__floordiv__method(TensorObject* self, other_tensor = full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -1430,7 +1523,13 @@ static PyObject* tensor__pow__method(TensorObject* self, // 2. create or get tensor for other_obj paddle::experimental::Tensor other_tensor; - if (!PyCheckTensor(other_obj)) { + if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__pow__", 0); if (PyComplex_Check(other_obj)) { @@ -1442,8 +1541,6 @@ static PyObject* tensor__pow__method(TensorObject* self, other_tensor = full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -1518,7 +1615,13 @@ static PyObject* tensor__rpow__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__rpow__", 0); if (PyComplex_Check(other_obj)) { @@ -1530,8 +1633,6 @@ static PyObject* tensor__rpow__method(TensorObject* self, other_tensor = full_ad_func( self_tensor.shape(), value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. 
promote types or unify right var type to left var @@ -1604,7 +1705,13 @@ static PyObject* tensor__ne__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__ne__", 0); if (PyComplex_Check(other_obj)) { @@ -1616,8 +1723,6 @@ static PyObject* tensor__ne__method(TensorObject* self, other_tensor = full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. promote types or unify right var type to left var @@ -1690,7 +1795,13 @@ static PyObject* tensor__eq__method(TensorObject* self, phi::Scalar(other_double), self_tensor.dtype(), self_tensor.place()); - } else if (!PyCheckTensor(other_obj)) { + } else if (PyCheckTensor(other_obj)) { + other_tensor = CastPyArg2Tensor(other_obj, 0); + } else if (IsNumpyArray(other_obj)) { + py::object numpy_value = py::object(py::handle(other_obj), true); + other_tensor = paddle::experimental::Tensor(place); + InitTensorWithNumpyValue(numpy_value, place, &other_tensor); + } else { paddle::experimental::Scalar value = CastPyArg2Scalar(other_obj, "__eq__", 0); if (PyComplex_Check(other_obj)) { @@ -1702,8 +1813,6 @@ static PyObject* tensor__eq__method(TensorObject* self, other_tensor = full_ad_func({1}, value, self_tensor.dtype(), self_tensor.place()); } - } else { - other_tensor = CastPyArg2Tensor(other_obj, 0); } // 3. 
promote types or unify right var type to left var diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d9057ee4ca6ab..79c5bdda4337b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -739,6 +739,32 @@ def test_dygraph_add(self): self.func_dygraph_add() +class TestElementwiseAddop1(unittest.TestCase): + def func_dygraph_add(self): + paddle.disable_static() + + np_a = np.random.random((2, 3, 4)).astype(np.float32) + np_b = np.random.random((2, 3, 4)).astype(np.float32) + + tensor_a = paddle.to_tensor(np_a, dtype="float32") + tensor_b = paddle.to_tensor(np_b, dtype="float32") + + # normal case: nparray + tenor + expect_out = np_a + np_b + actual_out = np_a + tensor_b + np.testing.assert_allclose(actual_out, expect_out) + + # normal case: tensor + nparray + actual_out = tensor_a + np_b + np.testing.assert_allclose(actual_out, expect_out) + + paddle.enable_static() + + def test_dygraph_add(self): + with _test_eager_guard(): + self.func_dygraph_add() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 7a0c5d09fbffc..9f37a456b7441 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -18,6 +18,7 @@ import paddle from paddle import fluid from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard class ElementwiseDivOp(OpTest): @@ -436,6 +437,34 @@ def init_grad_input_output(self): self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y) +class TestElementwiseDivop(unittest.TestCase): + def func_dygraph_div(self): + paddle.disable_static() + + np_a = np.random.random((2, 3, 4)).astype(np.float32) + np_b = np.random.random((2, 3, 4)).astype(np.float32) + np_a[np.abs(np_a) < 0.0005] = 0.002 + np_b[np.abs(np_b) < 0.0005] = 0.002 + + tensor_a = paddle.to_tensor(np_a, dtype="float32") + tensor_b = paddle.to_tensor(np_b, dtype="float32") + + # normal case: nparray / tenor + expect_out = np_a / np_b + actual_out = np_a / tensor_b + np.testing.assert_allclose(actual_out, expect_out) + + # normal case: tensor / nparray + actual_out = tensor_a / np_b + np.testing.assert_allclose(actual_out, expect_out) + + paddle.enable_static() + + def test_dygraph_div(self): + with _test_eager_guard(): + self.func_dygraph_div() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 263fb8a998182..c72728cfe951b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -19,6 +19,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard from paddle.fluid.tests.unittests.op_test import ( OpTest, @@ -386,6 +387,32 @@ def init_grad_input_output(self): self.grad_y = self.grad_out * np.conj(self.x) +class TestElementwiseMulop(unittest.TestCase): + def func_dygraph_mul(self): + paddle.disable_static() + + np_a = np.random.random((2, 3, 4)).astype(np.float32) + np_b = np.random.random((2, 3, 
4)).astype(np.float32) + + tensor_a = paddle.to_tensor(np_a, dtype="float32") + tensor_b = paddle.to_tensor(np_b, dtype="float32") + + # normal case: nparray * tenor + expect_out = np_a * np_b + actual_out = np_a * tensor_b + np.testing.assert_allclose(actual_out, expect_out) + + # normal case: tensor * nparray + actual_out = tensor_a * np_b + np.testing.assert_allclose(actual_out, expect_out) + + paddle.enable_static() + + def test_dygraph_mul(self): + with _test_eager_guard(): + self.func_dygraph_mul() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index d89b3b22aa3bb..d2ad1d90f0846 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -450,6 +450,36 @@ def test_dygraph_sub(self): self.func_dygraph_sub() +class TestFloatElementwiseSubop1(unittest.TestCase): + def func_dygraph_sub(self): + paddle.disable_static() + + np_a = np.random.random((2, 3, 4)).astype(np.float32) + np_b = np.random.random((2, 3, 4)).astype(np.float32) + + tensor_a = paddle.to_tensor(np_a, dtype="float32") + tensor_b = paddle.to_tensor(np_b, dtype="float32") + + # normal case: nparray - tenor + expect_out = np_a - np_b + actual_out = np_a - tensor_b + np.testing.assert_allclose( + actual_out, expect_out, rtol=1e-07, atol=1e-07 + ) + + # normal case: tenor - nparray + actual_out = tensor_a - np_b + np.testing.assert_allclose( + actual_out, expect_out, rtol=1e-07, atol=1e-07 + ) + + paddle.enable_static() + + def test_dygraph_sub(self): + with _test_eager_guard(): + self.func_dygraph_sub() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 9efdb268a4b69..61c843e9780c7 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -380,5 +380,107 @@ def test_matmul(self): np.testing.assert_allclose(a_np @ b_np, c_np, rtol=1e-05) +class TestDygraphMathOpPatches(unittest.TestCase): + def init_data(self): + self.np_a = np.random.random((2, 3, 4)).astype(np.float32) + self.np_b = np.random.random((2, 3, 4)).astype(np.float32) + self.np_a[np.abs(self.np_a) < 0.0005] = 0.002 + self.np_b[np.abs(self.np_b) < 0.0005] = 0.002 + + self.tensor_a = paddle.to_tensor(self.np_a, dtype="float32") + self.tensor_b = paddle.to_tensor(self.np_b, dtype="float32") + + def test_dygraph_greater_than(self): + paddle.disable_static() + self.init_data() + # normal case: tenor > nparray + expect_out = self.np_a > self.np_b + actual_out = self.tensor_a > self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_greater_equal(self): + paddle.disable_static() + self.init_data() + # normal case: tenor >= nparray + expect_out = self.np_a >= self.np_b + actual_out = self.tensor_a >= self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_reminder(self): + paddle.disable_static() + self.init_data() + # normal case: tenor % nparray + expect_out = self.np_a % self.np_b + actual_out = self.tensor_a % self.np_b + np.testing.assert_allclose(actual_out, expect_out, rtol=1e-7, atol=1e-7) + paddle.enable_static() + + def test_dygraph_less_than(self): + paddle.disable_static() + 
self.init_data() + # normal case: tenor < nparray + expect_out = self.np_a < self.np_b + actual_out = self.tensor_a < self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_less_equal(self): + paddle.disable_static() + self.init_data() + # normal case: tenor <= nparray + expect_out = self.np_a <= self.np_b + actual_out = self.tensor_a <= self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_floor_divide(self): + paddle.disable_static() + np_a = np.random.random((2, 3, 4)).astype(np.int32) + np_b = np.random.random((2, 3, 4)).astype(np.int32) + np_b[np.abs(np_b) < 1] = 2 + # normal case: tenor // nparray + tensor_a = paddle.to_tensor(np_a, dtype="int32") + tensor_b = paddle.to_tensor(np_b, dtype="int32") + expect_out = np_a // np_b + actual_out = tensor_a // np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_elementwise_pow(self): + paddle.disable_static() + self.init_data() + # normal case: tenor ** nparray + expect_out = self.np_a**self.np_b + actual_out = self.tensor_a**self.np_b + np.testing.assert_allclose(actual_out, expect_out, rtol=1e-7, atol=1e-7) + + # normal case: nparray ** tensor + expect_out = self.np_a**self.np_b + actual_out = self.np_a**self.tensor_b + np.testing.assert_allclose(actual_out, expect_out, rtol=1e-7, atol=1e-7) + + paddle.enable_static() + + def test_dygraph_not_equal(self): + paddle.disable_static() + self.init_data() + # normal case: tenor != nparray + expect_out = self.np_a != self.np_b + actual_out = self.tensor_a != self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + def test_dygraph_equal(self): + paddle.disable_static() + self.init_data() + # normal case: tenor == nparray + expect_out = self.np_a == self.np_b + actual_out = self.tensor_a == self.np_b + np.testing.assert_equal(actual_out, expect_out) + paddle.enable_static() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 9af6d6598d29a..a7c199bb4b3fd 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -714,6 +714,28 @@ def init_grad_input_output(self): self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out) +class TestMatmulop(unittest.TestCase): + def func_dygraph_matmul(self): + paddle.disable_static() + + np_a = np.random.random((2, 4)).astype(np.float32) + np_b = np.random.random((4, 2)).astype(np.float32) + + tensor_a = paddle.to_tensor(np_a, dtype="float32") + tensor_b = paddle.to_tensor(np_b, dtype="float32") + + # normal case: tensor @ nparray + expect_out = np_a @ np_b + actual_out = tensor_a @ np_b + np.testing.assert_allclose(actual_out, expect_out) + + paddle.enable_static() + + def func_dygraph_matmul(self): + with _test_eager_guard(): + self.func_dygraph_matmul() + + if __name__ == "__main__": paddle.enable_static() unittest.main() From baa1f6634ea63cd5f664049791ebdafc7130c4c2 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Wed, 23 Nov 2022 19:40:08 +0800 Subject: [PATCH 178/210] Remove API: mean_iou (#47971) remove mean_iou which is not used in paddle 2.0 --- python/paddle/fluid/layers/nn.py | 68 ------------------- .../fluid/tests/unittests/test_layers.py | 7 -- .../fluid/tests/unittests/test_mean_iou.py | 18 ----- 3 files changed, 93 deletions(-) diff --git 
a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 517e86c3f91d7..3cc25fd643be6 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -114,7 +114,6 @@ 'gather_nd', 'scatter', 'random_crop', - 'mean_iou', 'relu', 'log', 'crop_tensor', @@ -7626,73 +7625,6 @@ def relu(x, name=None): return out -def mean_iou(input, label, num_classes): - r""" - Mean Intersection-Over-Union is a common evaluation metric for - semantic image segmentation, which first computes the IOU for each - semantic class and then computes the average over classes. - IOU is defined as follows: - - .. math:: - - IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}. - - The predictions are accumulated in a confusion matrix and mean-IOU - is then calculated from it. - - - Parameters: - input (Tensor): A n-D Tensor of prediction results for semantic labels with type int32 or int64. - label (Tensor): A Tensor of ground truth labels with type int32 or int64. - Its shape should be the same as input. - num_classes (int32): The possible number of labels. - - Returns: - Three Tensors. - - - mean_iou(Tensor) : A 1-D Tensor representing the mean intersection-over-union with shape [1]. \ - Data type is float32. - - out_wrong(Tensor) : A 1-D Tensor with shape [num_classes]. Data type is int32. \ - The wrong numbers of each class. - - out_correct(Tensor): A 1-D Tensor with shape [num_classes]. Data type is int32. The correct numbers of each class. - - - Examples: - - .. code-block:: python - - import paddle - - iou_shape = [64, 32, 32] - num_classes = 5 - predict = paddle.randint(low=0, high=255, shape=iou_shape, dtype='int64') - label = paddle.randint(low=0, high=255, shape=iou_shape, dtype='int64') - mean_iou, out_wrong, out_correct = paddle.metric.mean_iou(predict, label, num_classes) - """ - if _non_static_mode(): - return _legacy_C_ops.mean_iou(input, label, 'num_classes', num_classes) - - helper = LayerHelper('mean_iou', **locals()) - check_variable_and_dtype( - input, 'Predictions', ['int32', 'int64'], 'mean_iou' - ) - check_variable_and_dtype(label, 'Labels', ['int32', 'int64'], 'mean_iou') - out_mean_iou = helper.create_variable_for_type_inference(dtype='float32') - out_wrong = helper.create_variable_for_type_inference(dtype='int32') - out_correct = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="mean_iou", - inputs={"Predictions": input, "Labels": label}, - outputs={ - "OutMeanIou": out_mean_iou, - "OutWrong": out_wrong, - "OutCorrect": out_correct, - }, - attrs={"num_classes": num_classes}, - ) - return out_mean_iou, out_wrong, out_correct - - def crop_tensor(x, shape=None, offsets=None, name=None): """ Crop input into output, as specified by offsets and shape. 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 60e0543287662..9fa79c681a892 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3476,13 +3476,6 @@ def make_l2_normalize(self): output = layers.l2_normalize(x, axis=1) return output - def make_mean_iou(self): - with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()): - x = self._get_data(name='x', shape=[16], dtype='int32') - y = self._get_data(name='label', shape=[16], dtype='int32') - iou = layers.mean_iou(x, y, self._high_data_bound) - return iou - def make_argsort(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py index 3c78395755fb8..c37e519c2cf51 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_iou.py +++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py @@ -15,7 +15,6 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid def compute_mean_iou( @@ -140,22 +139,5 @@ def test_check_output(self): self.check_output(check_dygraph=False, check_eager=False) -class TestMeanIOUOpError(unittest.TestCase): - def test_errors(self): - with fluid.program_guard(fluid.Program(), fluid.Program()): - # The input type of accuracy_op must be Variable. - x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace() - ) - y1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace() - ) - self.assertRaises(TypeError, fluid.layers.mean_iou, x1, y1) - # The input dtype of accuracy_op must be float32 or float64. - x2 = fluid.layers.data(name='x2', shape=[4], dtype="float32") - y2 = fluid.layers.data(name='x2', shape=[4], dtype="float32") - self.assertRaises(TypeError, fluid.layers.mean_iou, x2, y2) - - if __name__ == '__main__': unittest.main() From 6ea8bfc6d3e38f7686c42028b5687b9cbff0f986 Mon Sep 17 00:00:00 2001 From: heyanru <81976792+heyanru01@users.noreply.github.com> Date: Wed, 23 Nov 2022 19:58:08 +0800 Subject: [PATCH 179/210] [Fluid Clean] Remove paddle.fluid.layers.nn.reduce_max/min (#48236) --- python/paddle/fluid/layers/distributions.py | 8 +- python/paddle/fluid/layers/nn.py | 148 +----------------- .../dygraph_to_static/test_sentiment.py | 6 +- .../unittests/dygraph_to_static/test_tsm.py | 2 +- .../unittests/ipu/test_reduce_x_op_ipu.py | 4 +- .../fluid/tests/unittests/test_assert_op.py | 4 +- 6 files changed, 13 insertions(+), 159 deletions(-) diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 196d89db33e5f..18b7f26713ab1 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -530,9 +530,9 @@ def kl_divergence(self, other): """ check_type(other, 'other', Categorical, 'kl_divergence') - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) - other_logits = other.logits - nn.reduce_max( - other.logits, dim=-1, keep_dim=True + logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True) + other_logits = other.logits - paddle.max( + other.logits, axis=-1, keepdim=True ) e_logits = paddle.exp(logits) other_e_logits = paddle.exp(other_logits) @@ -554,7 +554,7 @@ def entropy(self): Variable: Shannon entropy of Categorical distribution. The data type is float32. 
""" - logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True) e_logits = paddle.exp(logits) z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) prob = e_logits / z diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3cc25fd643be6..3e8eddff75bc3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -79,8 +79,6 @@ 'data_norm', 'reduce_sum', 'reduce_mean', - 'reduce_max', - 'reduce_min', 'reduce_all', 'reduce_any', 'dropout', @@ -190,7 +188,7 @@ def _get_reduce_dim(dim, input): """ - Internal function for reduce_sum, reduce_mean, reduce_max, reduce_min, reduce_prod. + Internal function for reduce_sum, reduce_mean, reduce_prod. It computes the attribute reduce_all value based on axis. """ if dim is not None and not isinstance(dim, list): @@ -3938,150 +3936,6 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): return paddle.mean(x=input, axis=dim, keepdim=keep_dim, name=name) -def reduce_max(input, dim=None, keep_dim=False, name=None): - """ - - Computes the maximum of tensor elements over the given dimension. - - Args: - input (Variable): The input variable which is a Tensor, the data type is float32, - float64, int32, int64. - dim (list|int, optional): The dimension along which the maximum is computed. - If :attr:`None`, compute the maximum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. - If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. - keep_dim (bool, optional): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension - than the :attr:`input` unless :attr:`keep_dim` is true, default - value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - Variable: Tensor, results of maximum on the specified dim of input tensor, - it's data type is the same as input's Tensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - # x is a Tensor variable with following elements: - # [[0.2, 0.3, 0.5, 0.9] - # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the corresponding output tensor. - x = fluid.data(name='x', shape=[2, 4], dtype='float32') - fluid.layers.reduce_max(x) # [0.9] - fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] - fluid.layers.reduce_max(x, dim=-1) # [0.9, 0.7] - fluid.layers.reduce_max(x, dim=1, keep_dim=True) # [[0.9], [0.7]] - - # y is a Tensor variable with shape [2, 2, 2] and elements as below: - # [[[1.0, 2.0], [3.0, 4.0]], - # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the corresponding output tensor. 
- y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_max(y, dim=[1, 2]) # [4.0, 8.0] - fluid.layers.reduce_max(y, dim=[0, 1]) # [7.0, 8.0] - """ - helper = LayerHelper('reduce_max', **locals()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - - if dim is not None and not isinstance(dim, list): - dim = [dim] - - if in_dygraph_mode(): - return _C_ops.max(input, dim if dim is not None else [], keep_dim) - - helper.append_op( - type='reduce_max', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': dim if dim is not None and dim != [] else [0], - 'keep_dim': keep_dim, - 'reduce_all': True - if dim is None or dim == [] or len(dim) == len(input.shape) - else False, - }, - ) - return out - - -def reduce_min(input, dim=None, keep_dim=False, name=None): - """ - - Computes the minimum of tensor elements over the given dimension. - - Args: - input (Variable): The input variable which is a Tensor, the data type is float32, - float64, int32, int64. - dim (list|int, optional): The dimensions along which the minimum is computed. - If :attr:`None`, compute the minimum over all elements of - :attr:`input` and return a Tensor variable with a single element, - otherwise must be in the range :math:`[-rank(input), rank(input))`. - If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. - keep_dim (bool, optional): Whether to reserve the reduced dimension in the - output Tensor. The result tensor will have one fewer dimension - than the :attr:`input` unless :attr:`keep_dim` is true, default - value is False. - name(str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - Variable: Tensor, result of minimum on the specified dim of input tensor, - it's data type is the same as input's Tensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle - paddle.enable_static() - - # x is a Tensor variable with following elements: - # [[0.2, 0.3, 0.5, 0.9] - # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the corresponding output tensor. - x = fluid.data(name='x', shape=[2, 4], dtype='float32') - fluid.layers.reduce_min(x) # [0.1] - fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] - fluid.layers.reduce_min(x, dim=-1) # [0.2, 0.1] - fluid.layers.reduce_min(x, dim=1, keep_dim=True) # [[0.2], [0.1]] - - # y is a Tensor variable with shape [2, 2, 2] and elements as below: - # [[[1.0, 2.0], [3.0, 4.0]], - # [[5.0, 6.0], [7.0, 8.0]]] - # Each example is followed by the corresponding output tensor. 
- y = fluid.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_min(y, dim=[1, 2]) # [1.0, 5.0] - fluid.layers.reduce_min(y, dim=[0, 1]) # [1.0, 2.0] - """ - helper = LayerHelper('reduce_min', **locals()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - if dim is not None and not isinstance(dim, list): - dim = [dim] - - if in_dygraph_mode(): - return _C_ops.min(input, dim if dim is not None else [], keep_dim) - - helper.append_op( - type='reduce_min', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': dim if dim is not None and dim != [] else [0], - 'keep_dim': keep_dim, - 'reduce_all': True - if dim is None or dim == [] or len(dim) == len(input.shape) - else False, - }, - ) - return out - - def reduce_all(input, dim=None, keep_dim=False, name=None): """ diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 3e4d1ddf8d1ef..13593e1c02c08 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -52,7 +52,7 @@ def __init__( def forward(self, inputs): x = paddle.tanh(self._conv2d(inputs)) - x = fluid.layers.reduce_max(x, dim=-1) + x = paddle.max(x, axis=-1) x = paddle.reshape(x, shape=[self.batch_size, -1]) return x @@ -194,7 +194,7 @@ def forward(self, inputs, label=None): emb = paddle.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) gru_hidden = self._gru(fc_1) - gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1) + gru_hidden = paddle.max(gru_hidden, axis=1) tanh_1 = paddle.tanh(gru_hidden) fc_2 = self._fc2(tanh_1) prediction = self._fc_prediction(fc_2) @@ -254,7 +254,7 @@ def forward(self, inputs, label=None): encoded_vector = fluid.layers.concat( input=[gru_forward_tanh, gru_backward_tanh], axis=2 ) - encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1) + encoded_vector = paddle.max(encoded_vector, axis=1) fc_2 = self._fc2(encoded_vector) prediction = self._fc_prediction(fc_2) # TODO(Aurelius84): Uncomment the following codes when we support return variable-length vars. 
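# Illustrative migration sketch (not from the original commits): the removed
# fluid.layers.reduce_max/reduce_min calls map onto paddle.max/paddle.min with
# dim -> axis and keep_dim -> keepdim, which is the substitution these hunks
# apply. The sample tensor mirrors the docstring example deleted above and is
# for illustration only.
import paddle

x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9],
                      [0.1, 0.2, 0.6, 0.7]])
# previously: fluid.layers.reduce_max(x, dim=-1, keep_dim=True)
row_max = paddle.max(x, axis=-1, keepdim=True)  # [[0.9], [0.7]]
# previously: fluid.layers.reduce_min(x, dim=0)
col_min = paddle.min(x, axis=0)                 # [0.1, 0.2, 0.5, 0.7]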
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index dde28dadfd4e7..f9c69aca6849e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -208,7 +208,7 @@ def forward(self, inputs): y = self.pool2d_avg(y) y = fluid.layers.dropout(y, dropout_prob=0.5) y = paddle.reshape(y, [-1, self.seg_num, y.shape[1]]) - y = fluid.layers.reduce_mean(y, dim=1) + y = paddle.mean(y, axis=1) y = paddle.reshape(y, shape=[-1, 2048]) y = self.out(y) return y diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index 3756ee91dbfbb..a7071e45d42bf 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -123,12 +123,12 @@ def test_case7(self): class TestMax(TestMean): def set_test_op(self): - self.op = paddle.fluid.layers.reduce_max + self.op = paddle.max class TestMin(TestMean): def set_test_op(self): - self.op = paddle.fluid.layers.reduce_min + self.op = paddle.min class TestSum(TestMean): diff --git a/python/paddle/fluid/tests/unittests/test_assert_op.py b/python/paddle/fluid/tests/unittests/test_assert_op.py index 9a91ebca5d89c..4ce1fb9a65c52 100644 --- a/python/paddle/fluid/tests/unittests/test_assert_op.py +++ b/python/paddle/fluid/tests/unittests/test_assert_op.py @@ -70,7 +70,7 @@ def net_func(): def test_assert_summary(self): def net_func(): x = layers.fill_constant(shape=[10], dtype='float32', value=2.0) - condition = layers.reduce_max(x) < 1.0 + condition = paddle.max(x) < 1.0 layers.Assert(condition, (x,), 5) print("test_assert_summary") @@ -80,7 +80,7 @@ def net_func(): def test_assert_summary_greater_than_size(self): def net_func(): x = layers.fill_constant(shape=[2, 3], dtype='float32', value=2.0) - condition = layers.reduce_max(x) < 1.0 + condition = paddle.max(x) < 1.0 layers.Assert(condition, [x], 10, name="test") print("test_assert_summary_greater_than_size") From 88cac16b2cd3797c5009b0505fbed40ec39714ae Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 23 Nov 2022 20:36:48 +0800 Subject: [PATCH 180/210] [PHI decoupling] move im2col from fluid to phi (#48174) * decouple im2col from fluid * move im2col to phi * fix build error * delete redundant comment --- paddle/fluid/operators/conv_op.h | 2 +- paddle/fluid/operators/im2sequence_op.h | 11 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/context_project.h | 8 +- paddle/fluid/operators/math/im2col_test.cc | 59 +++------- paddle/phi/kernels/funcs/CMakeLists.txt | 1 + .../math => phi/kernels/funcs}/im2col.cc | 103 ++++++++--------- .../math => phi/kernels/funcs}/im2col.cu | 108 ++++++++---------- .../math => phi/kernels/funcs}/im2col.h | 19 ++- .../kernels/funcs}/im2col_cfo_cpu.h | 14 +-- .../phi/kernels/impl/conv_grad_kernel_impl.h | 22 +--- paddle/phi/kernels/impl/conv_kernel_impl.h | 6 +- .../impl/conv_transpose_grad_kernel_impl.h | 6 +- .../kernels/impl/conv_transpose_kernel_impl.h | 6 +- .../phi/kernels/impl/fold_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/fold_kernel_impl.h | 6 +- .../kernels/impl/unfold_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/unfold_kernel_impl.h | 6 +- 18 files changed, 160 insertions(+), 230 deletions(-) rename paddle/{fluid/operators/math => 
phi/kernels/funcs}/im2col.cc (79%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/im2col.cu (87%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/im2col.h (91%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/im2col_cfo_cpu.h (98%) diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 79d07887fb0e0..924ed1fcf7d35 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/vol2col.h" namespace paddle { diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 9fcf02e999d17..afb4db0f3c633 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -101,7 +101,8 @@ class Im2SequenceKernel : public framework::OpKernel { kernels[1]}); offset_out += output_height[i] * output_width[i]; - math::Im2ColFunctor f; + phi::funcs::Im2ColFunctor + f; auto& dev_ctx = ctx.template device_context(); f(dev_ctx, src, dilations, strides, paddings, &dst); } @@ -135,7 +136,8 @@ class Im2SequenceKernel : public framework::OpKernel { kernels[0], kernels[1]}); - math::Im2ColFunctor f; + phi::funcs::Im2ColFunctor + f; auto& dev_ctx = ctx.template device_context(); f(dev_ctx, src, dilations, strides, paddings, &dst); } @@ -190,7 +192,8 @@ class Im2SequenceGradKernel : public framework::OpKernel { d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); const Tensor src = d_out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, kernels[0], kernels[1]}); - math::Col2ImFunctor f; + phi::funcs::Col2ImFunctor + f; auto& dev_ctx = ctx.template device_context(); f(dev_ctx, src, dilations, strides, paddings, &dst); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 1f5dd8a9b2284..e2a62273d0328 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -23,7 +23,6 @@ endif() math_library(context_project DEPS im2col math_function) math_library(cos_sim_functor) math_library(depthwise_conv) -math_library(im2col) math_library(sample_prob) math_library(sampler DEPS generator) diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index f7f2cfb64aa34..832be9b0efd2f 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/im2col.h" namespace paddle { namespace operators { @@ -100,7 +100,8 @@ class ContextProjectFunctor { phi::DenseTensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + phi::funcs::Im2ColFunctor + im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -230,7 +231,8 @@ class ContextProjectGradFunctor { phi::DenseTensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + phi::funcs::Col2ImFunctor + col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 70ac7a225d6a3..1fa0cb1aeb161 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include -#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/im2col_cfo_cpu.h" template void testIm2col() { @@ -76,15 +77,9 @@ void testIm2col() { {output_height, output_width, 1, filter_size, filter_size}, *place); // Im2Col - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> + phi::funcs::Im2ColFunctor im2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> + phi::funcs::Im2ColFunctor im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -119,15 +114,9 @@ void testIm2col() { } // Col2Im: kCFO - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, - DeviceContext, - float> + phi::funcs::Col2ImFunctor col2im; - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, - DeviceContext, - float> + phi::funcs::Col2ImFunctor col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -237,15 +226,9 @@ void testIm2col() { {output_height, output_width, 1, filter_size, filter_size}, *place); // Im2Col - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - phi::GPUContext, - float> + phi::funcs::Im2ColFunctor im2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, - phi::GPUContext, - float> + phi::funcs::Im2ColFunctor im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -280,15 +263,9 @@ void testIm2col() { } // Col2Im: kCFO - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, - phi::GPUContext, - float> + phi::funcs::Col2ImFunctor col2im; - paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, - phi::GPUContext, - float> + phi::funcs::Col2ImFunctor col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -363,18 +340,15 @@ TEST(math, im2col) { int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ 
ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ - paddle::operators::math::Im2ColFunctor< \ - paddle::operators::math::ColFormat::kCFO, \ - phi::CPUContext, \ - float> \ - im2col + phi::funcs:: \ + Im2ColFunctor \ + im2col void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { PREPARE_IM2COL_CPU; im2col(context, input, dilation, stride, padding, &out); - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); + phi::funcs::im2col_common(input, dilation, stride, padding, &ref); float* ref_data = ref.data(); float* out_data = out.data(); @@ -398,8 +372,7 @@ void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { auto t2 = GetCurrentMs(); for (int i = 0; i < repeat; ++i) { - paddle::operators::math::im2col_common( - input, dilation, stride, padding, &ref); + phi::funcs::im2col_common(input, dilation, stride, padding, &ref); } auto t3 = GetCurrentMs(); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 41c6cf677717d..d429d4a8dad2c 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -17,6 +17,7 @@ math_library(segment_pooling) math_library(sequence2batch) math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function) math_library(cross_entropy) +math_library(im2col) math_library(vol2col) cc_library( diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/phi/kernels/funcs/im2col.cc similarity index 79% rename from paddle/fluid/operators/math/im2col.cc rename to paddle/phi/kernels/funcs/im2col.cc index 39b0312e67766..71d9c49e347d0 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/phi/kernels/funcs/im2col.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/im2col.h" - -#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/phi/kernels/funcs/im2col.h" +#include "paddle/phi/kernels/funcs/im2col_cfo_cpu.h" namespace phi { class CPUContext; } // namespace phi -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { /* * im = [input_channels, input_height, input_width] @@ -30,9 +28,7 @@ namespace math { * [input_channels, filter_height, filter_width, output_height, output_width] */ template -class Im2ColFunctor { +class Im2ColFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& im, @@ -43,13 +39,13 @@ class Im2ColFunctordims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col->dims())); @@ -77,9 +73,7 @@ class Im2ColFunctor -class Col2ImFunctor { +class Col2ImFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& col, @@ -90,13 +84,13 @@ class Col2ImFunctordims().size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'im' should be 3. 
But got " "the dims of tensor 'im' is [%s].", im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col.dims())); @@ -111,22 +105,22 @@ class Col2ImFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; @@ -179,9 +173,7 @@ template class Col2ImFunctor -class Im2ColFunctor { +class Im2ColFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& im, @@ -192,13 +184,13 @@ class Im2ColFunctordims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col->dims())); @@ -254,9 +246,7 @@ class Im2ColFunctor -class Col2ImFunctor { +class Col2ImFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& col, @@ -267,13 +257,13 @@ class Col2ImFunctordims().size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'im' should be 3. But got " "the dims of tensor 'im' is [%s].", im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col.dims())); @@ -288,14 +278,14 @@ class Col2ImFunctordata(); const T* col_data = col.data(); @@ -335,18 +325,17 @@ class Col2ImFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu similarity index 87% rename from paddle/fluid/operators/math/im2col.cu rename to paddle/phi/kernels/funcs/im2col.cu index 5c7038714e93c..78a0b345e389f 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/phi/kernels/funcs/im2col.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,14 +15,13 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/im2col.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template __global__ void im2col(const T* data_im, @@ -86,9 +85,7 @@ __global__ void im2col(const T* data_im, * [input_channels, filter_height, filter_width, output_height, output_width] */ template -class Im2ColFunctor { +class Im2ColFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& im, @@ -99,13 +96,13 @@ class Im2ColFunctordims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. 
But got " "the dims of tensor 'col' is [%s].", col->dims())); @@ -124,7 +121,7 @@ class Im2ColFunctor -class Col2ImFunctor { +class Col2ImFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& col, @@ -236,13 +231,13 @@ class Col2ImFunctordims().size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'im' should be 3. But got " "the dims of tensor 'im' is [%s].", im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col.dims())); @@ -258,28 +253,28 @@ class Col2ImFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; @@ -367,9 +362,7 @@ __global__ void im2colOCF(const T* im_data, * [output_height, output_width, input_channels, filter_height, filter_width] */ template -class Im2ColFunctor { +class Im2ColFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& im, @@ -380,13 +373,13 @@ class Im2ColFunctordims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col->dims())); @@ -479,9 +472,7 @@ __global__ void col2imOCF(const T* col_data, * [output_height, output_width, input_channels, filter_height, filter_width] */ template -class Col2ImFunctor { +class Col2ImFunctor { public: void operator()(const DeviceContext& context, const phi::DenseTensor& col, @@ -492,13 +483,13 @@ class Col2ImFunctordims().size(), 3, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'im' should be 3. But got " "the dims of tensor 'im' is [%s].", im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of tensor 'col' should be 5. But got " "the dims of tensor 'col' is [%s].", col.dims())); @@ -511,22 +502,22 @@ class Col2ImFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/phi/kernels/funcs/im2col.h similarity index 91% rename from paddle/fluid/operators/math/im2col.h rename to paddle/phi/kernels/funcs/im2col.h index 3ce785a8901a6..73b2866924d1e 100644 --- a/paddle/fluid/operators/math/im2col.h +++ b/paddle/phi/kernels/funcs/im2col.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,13 +16,13 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { using DataLayout = phi::DataLayout; @@ -107,6 +107,5 @@ class Col2ImFunctor { const DataLayout data_layout = DataLayout::kNCHW); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h similarity index 98% rename from paddle/fluid/operators/math/im2col_cfo_cpu.h rename to paddle/phi/kernels/funcs/im2col_cfo_cpu.h index bef9e0a8449f6..c901cc9f55144 100644 --- a/paddle/fluid/operators/math/im2col_cfo_cpu.h +++ b/paddle/phi/kernels/funcs/im2col_cfo_cpu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,11 +16,10 @@ limitations under the License. */ #include -#include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/core/dense_tensor.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { /** * The most common im2col algorithm. @@ -317,6 +316,5 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const phi::DenseTensor& im, } } -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h index e66a870c3aa25..ec75952aaae8e 100644 --- a/paddle/phi/kernels/impl/conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" @@ -147,10 +147,8 @@ void ConvGradKernel(const Context& dev_ctx, if (is_expand) { set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); } + phi::funcs::Col2ImFunctor col2im; phi::funcs::Col2VolFunctor col2vol; - paddle::operators::math:: - Col2ImFunctor - col2im; for (int i = 0; i < batch_size; i++) { DenseTensor out_grad_batch = @@ -203,9 +201,7 @@ void ConvGradKernel(const Context& dev_ctx, Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); set_zero(dev_ctx, filter_grad, static_cast(0)); - paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { DenseTensor out_grad_batch = @@ -381,10 +377,8 @@ void ConvGradGradKernel(const Context& dev_ctx, if (is_expand) { set_zero(dev_ctx, &transformed_dX, static_cast(0)); } + phi::funcs::Col2ImFunctor col2im; phi::funcs::Col2VolFunctor col2vol; - paddle::operators::math:: - Col2ImFunctor - col2im; for (int i = 0; i < batch_size; i++) { DenseTensor dy_batch = @@ -428,9 +422,7 @@ void ConvGradGradKernel(const Context& dev_ctx, 
set_zero(dev_ctx, dW, static_cast(0)); DenseTensor dW_arr = *dW; dW_arr.Resize(filter_matrix_shape); - paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; ++i) { DenseTensor dy_batch = @@ -477,9 +469,7 @@ void ConvGradGradKernel(const Context& dev_ctx, } set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; phi::funcs::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; ++i) { DenseTensor ddy_batch = diff --git a/paddle/phi/kernels/impl/conv_kernel_impl.h b/paddle/phi/kernels/impl/conv_kernel_impl.h index 59bea1d0564c6..06ba3104a8112 100644 --- a/paddle/phi/kernels/impl/conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_kernel_impl.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/kernels/conv_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" @@ -133,10 +133,8 @@ void ConvKernelImpl(const Context& dev_ctx, int in_step = static_cast(transformed_input.dims()[1]) / groups; int out_step = static_cast(transformed_output.dims()[1]) / groups; + phi::funcs::Im2ColFunctor im2col; phi::funcs::Vol2ColFunctor vol2col; - paddle::operators::math:: - Im2ColFunctor - im2col; auto blas = phi::funcs::GetBlas(dev_ctx); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index e25a6fd56ee2a..64810d82f0034 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/vol2col.h" @@ -143,9 +143,7 @@ void ConvTransposeGradRawKernel(const Context& ctx, DenseTensor dfilter_; funcs::SetConstant set_zero; - paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; phi::funcs::Vol2ColFunctor vol2col; funcs::ConcatFunctor concat_functor; diff --git a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h index a854bf3ee70de..819b1afcf6bb6 100644 --- a/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_transpose_kernel_impl.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/vol2col.h" @@ -136,9 
+136,7 @@ void ConvTransposeRawKernel(const Context& ctx, (data_layout != DataLayout::kNHWC ? static_cast(out_dims[1]) / groups : static_cast(out_dims[out_dims.size() - 1]) / groups); - paddle::operators::math:: - Col2ImFunctor - col2im; + phi::funcs::Col2ImFunctor col2im; phi::funcs::Col2VolFunctor col2vol; funcs::ConcatFunctor concat_functor; diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h index b9320eab85046..7118de3174f7d 100644 --- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -16,8 +16,8 @@ #include -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -60,9 +60,7 @@ void FoldGradKernel(const Context& ctx, output_height, output_width}); - paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; for (int i = 0; i < batch_size; i++) { DenseTensor out_grad_batch = out_grad.Slice(i, i + 1).Resize(out_shape); diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index 415beca7bd928..21864b00cae76 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -16,9 +16,9 @@ #include -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -36,9 +36,7 @@ void FoldKernel(const Context& ctx, const int batch_size = static_cast(x.dims()[0]); ctx.template Alloc(out); - paddle::operators::math:: - Col2ImFunctor - col2im; + phi::funcs::Col2ImFunctor col2im; const auto& x_dims = x.dims(); int output_height = (output_sizes[0] + 2 * paddings[0] - diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h index 66fa2a4dc04f5..78bd068041dd5 100644 --- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -16,8 +16,8 @@ #include -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -56,9 +56,7 @@ void UnfoldGradKernel(const Context& ctx, DDim out_matrix_shape = make_ddim( {x_dims[1], kernel_sizes[0], kernel_sizes[1], out_height, out_width}); - paddle::operators::math:: - Col2ImFunctor - col2im; + phi::funcs::Col2ImFunctor col2im; phi::funcs::SetConstant set_zero; set_zero(ctx, x_grad, static_cast(0)); diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h index 3b75e149f48e2..7b7e9923d0004 100644 --- a/paddle/phi/kernels/impl/unfold_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -16,8 +16,8 @@ #include -#include "paddle/fluid/operators/math/im2col.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -34,9 +34,7 @@ void UnfoldKernel(const Context& ctx, const int batch_size = static_cast(x.dims()[0]); ctx.template Alloc(out); - 
paddle::operators::math:: - Im2ColFunctor - im2col; + phi::funcs::Im2ColFunctor im2col; const auto& x_dims = x.dims(); int out_height = phi::funcs::CalcOutputSize(x_dims[2], From d828ca460a89c2ce88be15bb5cdb76c676decf91 Mon Sep 17 00:00:00 2001 From: Wen Sun <35923278+HermitSun@users.noreply.github.com> Date: Wed, 23 Nov 2022 21:27:06 +0800 Subject: [PATCH 181/210] Add static checks for collective communication on NCCL (#48256) * feat: static check --- .../fluid/distributed/collective/NCCLTools.cc | 104 ++++++++++++++++++ .../fluid/distributed/collective/NCCLTools.h | 27 +++++ .../collective/ProcessGroupNCCL.cc | 21 ++++ .../distributed/collective/ProcessGroupNCCL.h | 5 +- paddle/fluid/distributed/collective/utils.h | 2 +- .../communication/stream/all_gather.py | 23 ---- .../communication/stream/all_to_all.py | 24 ---- .../communication/stream/reduce_scatter.py | 25 ----- .../communication/stream/scatter.py | 22 ---- 9 files changed, 156 insertions(+), 97 deletions(-) diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc index a8c437bb12225..988232b617194 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -44,5 +44,109 @@ std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { return oss.str(); } +void StaticCheckTensor(const phi::DenseTensor& tensor, + int rank, + int world_size) { + // place check + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(tensor.place()), + true, + platform::errors::InvalidArgument("Tensor should be in GPU place.")); + // rank check + PADDLE_ENFORCE_GE(rank, + 0, + platform::errors::InvalidArgument( + "Rank should be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + rank, + world_size, + platform::errors::InvalidArgument("Rank is out of the process group.")); +} + +// static check for collective +void StaticCheckTensors(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size, + int out_size_factor, + int in_size_factor) { + // place check + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_tensor.place()), + true, + platform::errors::InvalidArgument( + "Output tensor should be in GPU place.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in_tensor.place()), + true, + platform::errors::InvalidArgument( + "Input tensor should be in GPU place.")); + // rank check + PADDLE_ENFORCE_GE(rank, + 0, + platform::errors::InvalidArgument( + "Rank should be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + rank, + world_size, + platform::errors::InvalidArgument("Rank is out of the process group.")); + // shape check + int64_t out_size = out_tensor.numel(); + PADDLE_ENFORCE_GT(out_size, + 0, + platform::errors::InvalidArgument( + "Size of output tensor should be greater than 0.")); + int64_t in_size = in_tensor.numel(); + PADDLE_ENFORCE_GT(in_size, + 0, + platform::errors::InvalidArgument( + "Size of input tensor should be greater than 0.")); + PADDLE_ENFORCE_EQ( + out_size * out_size_factor, + in_size * in_size_factor, + platform::errors::InvalidArgument( + "Input and output tensors should have matching sizes.")); + // dtype check + PADDLE_ENFORCE_EQ( + out_tensor.dtype(), + in_tensor.dtype(), + platform::errors::InvalidArgument( + "Input and output tensors should have the same data type.")); +} + +void StaticCheckTensorsSameShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size) { + StaticCheckTensors(out_tensor, + in_tensor, + rank, + 
world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ 1); +} + +void StaticCheckTensorsScatterLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size) { + StaticCheckTensors(out_tensor, + in_tensor, + rank, + world_size, + /*out_size_factor*/ world_size, + /*in_size_factor*/ 1); +} + +void StaticCheckTensorsGatherLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size) { + StaticCheckTensors(out_tensor, + in_tensor, + rank, + world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ world_size); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 37b1e0f114c3d..a882dae2e990d 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -63,5 +63,32 @@ ncclRedOp_t ToNCCLRedType(ReduceOp reduction); std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); +// static check for p2p +void StaticCheckTensor(const phi::DenseTensor& tensor, + int rank, + int world_size); + +// static check for collective +void StaticCheckTensors(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size, + int out_size_factor, + int in_size_factor); + +void StaticCheckTensorsSameShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size); + +void StaticCheckTensorsScatterLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size); + +void StaticCheckTensorsGatherLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int rank, + int world_size); } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 96666f50c91ef..e995161cf304d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/Common.h" +#include "paddle/fluid/distributed/collective/NCCLTools.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" @@ -137,6 +138,8 @@ std::shared_ptr ProcessGroupNCCL::AllGather( // numel > 0 indicates the tensor need to be sliced const phi::DenseTensor& in_tensor_maybe_partial = numel > 0 ? 
GetPartialTensor(in_tensor, offset, numel) : in_tensor; + StaticCheckTensorsGatherLikeShape( + *out_tensor, in_tensor_maybe_partial, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclAllGather( @@ -159,6 +162,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( const AllreduceOptions& opts, bool sync_op, bool use_calc_stream) { + StaticCheckTensorsSameShape(*out_tensor, in_tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclAllReduce( @@ -207,6 +211,15 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CheckSizeOnEachRank(out_dim, out_size_each_rank, size_); CheckSizeOnEachRank(in_dim, in_size_each_rank, size_); + // NOTE: Since `all_to_all` needs other processes's participation, it cannot + // simply be covered by static checks. Factors are set to 0 here to skip the + // shape check. Its shape check will be done by dynamic checks in debug mode. + StaticCheckTensors(*out_tensor, + in_tensor, + rank_, + size_, + /*out_size_factor*/ 0, + /*in_size_factor*/ 0); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { int64_t in_row_size = in_tensor.numel() / in_dim[0], @@ -274,6 +287,7 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( const BroadcastOptions& opts, bool sync_op, bool use_calc_stream) { + StaticCheckTensorsSameShape(*out_tensor, in_tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { int root = opts.source_rank + opts.source_root; @@ -298,6 +312,7 @@ std::shared_ptr ProcessGroupNCCL::Reduce( const ReduceOptions& opts, bool sync_op, bool use_calc_stream) { + StaticCheckTensorsSameShape(*out_tensor, in_tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclReduce( @@ -322,6 +337,7 @@ std::shared_ptr ProcessGroupNCCL::ReduceScatter( const ReduceScatterOptions& opts, bool sync_op, bool use_calc_stream) { + StaticCheckTensorsScatterLikeShape(*out_tensor, in_tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclReduceScatter( @@ -345,6 +361,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( const ScatterOptions& opts, bool sync_op, bool use_calc_stream) { + StaticCheckTensorsScatterLikeShape(*out_tensor, in_tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { int64_t numel = in_tensor.numel() / size_; @@ -400,6 +417,8 @@ std::shared_ptr ProcessGroupNCCL::Recv( partial_tensor = GetPartialTensor(*tensor, offset, numel); tensor = &partial_tensor; } + + StaticCheckTensor(*tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclRecv( @@ -426,6 +445,8 @@ std::shared_ptr ProcessGroupNCCL::Send( // numel > 0 indicates the tensor need to be sliced const phi::DenseTensor& tensor_maybe_partial = numel > 0 ? 
GetPartialTensor(tensor, offset, numel) : tensor; + + StaticCheckTensor(tensor_maybe_partial, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { NCCL_CHECK(platform::dynload::ncclSend( diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 5153b7a678dd4..2a184e182aae9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -210,6 +210,8 @@ class ProcessGroupNCCL final : public ProcessGroupStream { void CreateNCCLEnvCache(const Place& place, const std::string& place_key); + void SyncCalcStream(const Place& place); + std::shared_ptr RunFnInNCCLEnv( std::function fn, const phi::DenseTensor& tensor, @@ -217,8 +219,6 @@ class ProcessGroupNCCL final : public ProcessGroupStream { bool sync_op, bool use_calc_stream); - void SyncCalcStream(const Place& place); - // TODO(sunyilun): methods below will be removed later std::shared_ptr CreateTask( std::vector places, @@ -245,6 +245,7 @@ class ProcessGroupNCCL final : public ProcessGroupStream { private: std::shared_ptr store_; + std::unordered_map place_to_calc_event_; // event on calc stream std::unordered_map place_to_calc_ctx_; diff --git a/paddle/fluid/distributed/collective/utils.h b/paddle/fluid/distributed/collective/utils.h index a730a47dd0dff..5b98a36357011 100644 --- a/paddle/fluid/distributed/collective/utils.h +++ b/paddle/fluid/distributed/collective/utils.h @@ -19,7 +19,7 @@ namespace paddle { namespace distributed { -inline phi::DenseTensor GetPartialTensor(const phi::DenseTensor &tensor, +inline phi::DenseTensor GetPartialTensor(const phi::DenseTensor& tensor, int64_t offset, int64_t numel) { phi::DenseTensor tensor_flattened; diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py index 12f9e08f9d50b..1e3344d0dbba0 100644 --- a/python/paddle/distributed/communication/stream/all_gather.py +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -17,32 +17,11 @@ from paddle.distributed import collective -def _check_tensor_shape(tensor, shape, nranks=1): - expect_shape = list(shape) - expect_shape[0] *= nranks - if list(tensor.shape) != expect_shape: - raise RuntimeError("The tensor for all_gather is not correctly-sized.") - - -def _check_tensor_list_shape(tensor_list, shape, nranks=1): - if len(tensor_list) != nranks: - raise RuntimeError( - "The tensor_list for all_gather is not correctly-sized." - ) - for tensor in tensor_list: - if tensor.shape != shape: - raise RuntimeError( - "The tensor_list for all_gather is not correctly-sized." 
- ) - - def _all_gather_into_tensor_in_dygraph( out_tensor, in_tensor, group, sync_op, use_calc_stream ): group = collective._get_default_group() if group is None else group - _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) - if use_calc_stream: return group.process_group.all_gather_into_tensor_on_calc_stream( out_tensor, @@ -65,8 +44,6 @@ def _all_gather_in_dygraph( if len(tensor_list) == 0: tensor_list += [paddle.empty_like(tensor) for _ in range(group.nranks)] - else: - _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) if use_calc_stream: return group.process_group.all_gather_on_calc_stream( diff --git a/python/paddle/distributed/communication/stream/all_to_all.py b/python/paddle/distributed/communication/stream/all_to_all.py index 2787c6a3d4d09..cc3b473d90d34 100644 --- a/python/paddle/distributed/communication/stream/all_to_all.py +++ b/python/paddle/distributed/communication/stream/all_to_all.py @@ -23,29 +23,9 @@ ) -def _check_tensor_shape(tensor, shape, nranks=1): - if tensor.shape != shape: - raise RuntimeError('The tensor for alltoall is not correctly-sized.') - - -def _check_tensor_list_shape(tensor_list, shape, nranks=1): - if len(tensor_list) != nranks: - raise RuntimeError( - 'The tensor_list for alltoall is not correctly-sized.' - ) - for tensor in tensor_list: - if tensor.shape != shape: - raise RuntimeError( - 'The tensor_list for alltoall is not correctly-sized.' - ) - - def _all_to_all_tensor_in_dygraph( out_tensor, in_tensor, group, sync_op, use_calc_stream ): - - _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) - if use_calc_stream: return group.process_group.all_to_all_tensor_on_calc_stream( in_tensor, out_tensor @@ -68,10 +48,6 @@ def _all_to_all_in_dygraph( out_tensor_list += [ paddle.empty_like(tensor) for tensor in in_tensor_list ] - else: - _check_tensor_list_shape( - out_tensor_list, in_tensor_list[0].shape, group.nranks - ) if use_calc_stream: return group.process_group.all_to_all_on_calc_stream( diff --git a/python/paddle/distributed/communication/stream/reduce_scatter.py b/python/paddle/distributed/communication/stream/reduce_scatter.py index 4d26e8d2b66c5..80e1ae7aa2156 100644 --- a/python/paddle/distributed/communication/stream/reduce_scatter.py +++ b/python/paddle/distributed/communication/stream/reduce_scatter.py @@ -21,27 +21,6 @@ from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp -def _check_tensor_shape(tensor, shape, nranks=1): - expect_shape = list(shape) - expect_shape[0] //= nranks - if list(tensor.shape) != expect_shape: - raise RuntimeError( - "The in_tensor for reduce_scatter is not correctly-sized." - ) - - -def _check_tensor_list_shape(tensor_list, shape, nranks=1): - if len(tensor_list) != nranks: - raise RuntimeError( - "The tensor_list for reduce_scatter is not correctly-sized." - ) - for tensor in tensor_list: - if tensor.shape != shape: - raise RuntimeError( - "The tensor_list for reduce_scatter is not correctly-sized." 
- ) - - def _reduce_scatter_tensor_in_dygraph( out_tensor, in_tensor, @@ -53,8 +32,6 @@ def _reduce_scatter_tensor_in_dygraph( ): op_type = _get_reduce_op(op, caller) - _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) - if use_calc_stream: return group.process_group.reduce_scatter_tensor_on_calc_stream( out_tensor, in_tensor, op_type @@ -74,8 +51,6 @@ def _reduce_scatter_in_dygraph( ): op_type = _get_reduce_op(op, "reduce_scatter") - _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) - if use_calc_stream: return group.process_group.reduce_scatter_on_calc_stream( tensor, tensor_list, op_type diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py index 5767c2150d813..a1df9c71aee86 100644 --- a/python/paddle/distributed/communication/stream/scatter.py +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -25,31 +25,10 @@ ) -def _check_tensor_shape(tensor, shape, nranks=1): - expect_shape = list(shape) - expect_shape[0] //= nranks - if list(tensor.shape) != expect_shape: - raise RuntimeError("The in_tensor for scatter is not correctly-sized.") - - -def _check_tensor_list_shape(tensor_list, shape, nranks=1): - if len(tensor_list) != nranks: - raise RuntimeError( - "The tensor_list for scatter is not correctly-sized." - ) - for tensor in tensor_list: - if tensor.shape != shape: - raise RuntimeError( - "The tensor_list for scatter is not correctly-sized." - ) - - def _scatter_tensor_in_dygraph( out_tensor, in_tensor, src_rank_in_group, group, sync_op, use_calc_stream ): nranks = group.nranks - if group.rank == src_rank_in_group: - _check_tensor_shape(out_tensor, in_tensor.shape, nranks) if use_calc_stream: return group.process_group.scatter_tensor_on_calc_stream( @@ -74,7 +53,6 @@ def _scatter_in_dygraph( raise RuntimeError( "The tensor_list should not be empty on src rank." ) - _check_tensor_list_shape(tensor_list, tensor.shape, nranks) else: tensor_list = [tensor for _ in range(nranks)] From 9c5837fb8da4d08930f383a9b20a0607c6c16455 Mon Sep 17 00:00:00 2001 From: Vvsmile <450864116@qq.com> Date: Wed, 23 Nov 2022 22:37:00 +0800 Subject: [PATCH 182/210] [Clean Fluid API]Remove API: scatter (#47958) * Remove API: scatter replace paddle.fluid.layers.scatter with paddle.scatter * modify the call of scatter from old style to new style --- python/paddle/fluid/layers/nn.py | 91 ------------------- .../fluid/tests/unittests/test_layers.py | 2 +- 2 files changed, 1 insertion(+), 92 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3e8eddff75bc3..a140894990bb1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -110,7 +110,6 @@ 'resize_trilinear', 'resize_nearest', 'gather_nd', - 'scatter', 'random_crop', 'relu', 'log', @@ -7246,96 +7245,6 @@ def gather_nd(input, index, name=None): return output -@deprecated(since="2.0.0", update_to="paddle.scatter") -def scatter(input, index, updates, name=None, overwrite=True): - """ - :alias_main: paddle.scatter - :alias: paddle.scatter,paddle.tensor.scatter,paddle.tensor.manipulation.scatter - :old_api: paddle.fluid.layers.scatter - - **Scatter Layer** - - Output is obtained by updating the input on selected indices based on updates. - - .. 
code-block:: python - - import numpy as np - - #input: - input = np.array([[1, 1], [2, 2], [3, 3]]) - index = np.array([2, 1, 0, 1]) - # shape of updates should be the same as input - # shape of updates with dim > 1 should be the same as input - updates = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]) - overwrite = False - - # calculation: - if not overwrite: - for i in range(len(index)): - input[index[i]] = np.zeros((2)) - - for i in range(len(index)): - if (overwrite): - input[index[i]] = updates[i] - else: - input[index[i]] += updates[i] - # output: - out = np.array([[3, 3], [6, 6], [1, 1]]) - out.shape # [3, 2] - - Args: - input (Variable): The input N-D Tensor with rank>=1. Data type can be float32. - index (Variable): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. - updates (Variable): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. - name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - overwrite (bool): The mode that updating the output when there are same indices. - If True, use the overwrite mode to update the output of the same index, - if False, use the accumulate mode to update the output of the same index. - Default value is True. - - Returns: - Variable(Tensor|LoDTensor): The output is a Tensor with the same shape as input. - - Examples: - - .. code-block:: python - - import paddle - import numpy as np - import paddle.fluid as fluid - paddle.enable_static() - - input = fluid.layers.data(name='data', shape=[3, 2], dtype='float32', append_batch_size=False) - index = fluid.layers.data(name='index', shape=[4], dtype='int64', append_batch_size=False) - updates = fluid.layers.data(name='update', shape=[4, 2], dtype='float32', append_batch_size=False) - - output = fluid.layers.scatter(input, index, updates, overwrite=False) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - - in_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) - index_data = np.array([2, 1, 0, 1]).astype(np.int64) - update_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32) - - res = exe.run(fluid.default_main_program(), feed={'data':in_data, "index":index_data, "update":update_data}, fetch_list=[output]) - print(res) - # [array([[3., 3.], - # [6., 6.], - # [1., 1.]], dtype=float32)] - """ - helper = LayerHelper('scatter', **locals()) - dtype = helper.input_dtype() - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="scatter", - inputs={"X": input, "Ids": index, "Updates": updates}, - attrs={'overwrite': overwrite}, - outputs={"Out": out}, - ) - return out - - @templatedoc() def random_crop(x, shape, seed=None): """ diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 9fa79c681a892..ad1226e12d4a3 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3345,7 +3345,7 @@ def make_scatter(self): append_batch_size=False, dtype='float32', ) - out = layers.scatter(input=x, index=idx, updates=updates) + out = paddle.scatter(x, index=idx, updates=updates) return out def make_one_hot(self): From 3f2658150c52c56c9074fa1b253f3e05192274e0 Mon Sep 17 00:00:00 2001 From: wanghuancoder 
Date: Thu, 24 Nov 2022 07:40:01 +0800 Subject: [PATCH 183/210] dense tensor in eager mode support data_ptr (#48235) * dense tensor in eager mode support data_ptr --- paddle/fluid/pybind/eager_method.cc | 17 ++++++++++ .../tests/unittests/test_tensor_data_ptr.py | 31 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_data_ptr.py diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 3c52a705fc506..545889d347376 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1845,6 +1845,19 @@ static PyObject* tensor__unset_fake_empty(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_data_ptr(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + if (self->tensor.initialized() && self->tensor.is_dense_tensor()) { + ToPyObject((int64_t)std::dynamic_pointer_cast( // NOLINT + self->tensor.impl()) + ->data()); + } + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + #if defined(PADDLE_WITH_CUDA) static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, @@ -2100,6 +2113,10 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor__unset_fake_empty, METH_VARARGS | METH_KEYWORDS, NULL}, + {"data_ptr", + (PyCFunction)(void (*)(void))tensor_data_ptr, + METH_VARARGS | METH_KEYWORDS, + NULL}, #if defined(PADDLE_WITH_CUDA) {"_tensor_uva", (PyCFunction)(void (*)(void))tensor_method__uva, diff --git a/python/paddle/fluid/tests/unittests/test_tensor_data_ptr.py b/python/paddle/fluid/tests/unittests/test_tensor_data_ptr.py new file mode 100644 index 0000000000000..10e816370ecac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_data_ptr.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np + +import paddle + + +class TestTensorDataPtr(unittest.TestCase): + def test_tensor_data_ptr(self): + np_src = np.random.random((3, 8, 8)) + src = paddle.to_tensor(np_src, dtype="float64") + dst = paddle.Tensor() + src._share_buffer_to(dst) + self.assertEqual(src.data_ptr(), dst.data_ptr()) + + +if __name__ == '__main__': + unittest.main() From 0a9c1f59eb2199ff36650212370f1902a54bcc18 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 24 Nov 2022 07:40:26 +0800 Subject: [PATCH 184/210] [multiprocessing] Eager tensor support pickle (#48179) * eager tensot support pickle --- .../paddle/fluid/dataloader/dataloader_iter.py | 3 ++- python/paddle/fluid/dataloader/worker.py | 16 +++++++--------- python/paddle/incubate/__init__.py | 1 + 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index f248a1188bbee..e47d75326b357 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -21,7 +21,6 @@ import itertools import threading import numpy as np -import multiprocessing from collections import namedtuple from paddle.fluid.framework import ( _set_expected_place, @@ -422,6 +421,8 @@ def __init__(self, loader): self._shutdown = False def _init_workers(self): + import paddle.incubate.multiprocessing as multiprocessing + # multiprocess worker and indice queue list initial as empty self._workers = [] self._worker_status = [] diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 155208791eacd..64ec697cf5308 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -373,21 +373,19 @@ def _worker_loop( out_queue.put((idx, batch, None)) batch, structure = _flatten_batch(batch) if use_shared_memory: - # NOTE: In eager mode, Tensor._share_memory has no - # effect, fall back to _array_to_share_memory_tensor - def tensor_share_memory(tensor): - if _in_eager_without_dygraph_check(): - return core._array_to_share_memory_tensor(tensor) - return tensor._share_memory() + + def numpy2lodtensor(arr): + lodtensor = core.Tensor() + lodtensor.set(arr, core.CPUPlace()) + return lodtensor tensor_list = [ - core._array_to_share_memory_tensor(b) + numpy2lodtensor(b) if isinstance(b, np.ndarray) - else tensor_share_memory(b) + else b.value().get_tensor() for b in batch ] out_queue.put((idx, tensor_list, structure)) - core._remove_tensor_list_mmap_fds(tensor_list) else: out_queue.put((idx, batch, structure)) except KeyboardInterrupt: diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 0c5e90c6975e2..d5ff9454a8046 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -34,6 +34,7 @@ from . import autotune # noqa: F401 from . import nn # noqa: F401 from . import asp # noqa: F401 +from . 
import multiprocessing # noqa: F401 from ..fluid.layers.loss import identity_loss From bcf7513270f125112497dccdfa19c5a56ad4cc79 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 24 Nov 2022 09:13:10 +0800 Subject: [PATCH 185/210] do not calc reduce_all in eager mode (#48199) * do not calc reduce_all in eager mode * refine python c cast list * refine * refine * refine * refine * refine * refine * refine * refine * refine --- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 96 ++++++++++++++++--- paddle/fluid/pybind/eager_utils.h | 4 +- paddle/fluid/pybind/op_function_common.cc | 6 +- paddle/phi/kernels/prod_kernel.cc | 2 +- .../fluid/tests/unittests/test_mean_op.py | 4 +- python/paddle/tensor/linalg.py | 72 +++++++------- python/paddle/tensor/math.py | 23 +++-- python/paddle/tensor/stat.py | 5 +- 9 files changed, 141 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 545889d347376..a3e7f43faedaf 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1462,7 +1462,7 @@ static PyObject* tensor_method_set_string_list(TensorObject* self, PyObject* kwargs) { EAGER_TRY using Strings = std::vector; - auto strings = CastPyArg2Strings(PyTuple_GET_ITEM(args, 0), 0); + auto strings = CastPyArg2VectorOfString(PyTuple_GET_ITEM(args, 0), 0); auto var_tensor = std::make_shared(); *var_tensor->GetMutable() = strings; self->tensor.set_impl(var_tensor); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 4cbac193ad070..f5f409673a106 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -289,6 +289,9 @@ std::vector CastPyArg2VectorOfTensor( } } else if (obj == Py_None) { return {}; + } else if (PyObject_IsInstance(obj, + reinterpret_cast(p_tensor_type))) { + return {reinterpret_cast(obj)->tensor}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -335,6 +338,56 @@ std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos) { } } else if (obj == Py_None) { return {}; + } else if (PyObject_CheckLongOrConvertToLong(&obj)) { + return {static_cast(PyLong_AsLong(obj))}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list or tuple, but got %s", + arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } + return result; +} + +std::vector CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos) { + std::vector result; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GET_ITEM(obj, i); + if (PyObject_CheckLongOrConvertToLong(&item)) { + result.emplace_back(static_cast(PyLong_AsLong(item))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of int, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, + i)); + } + } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GET_ITEM(obj, i); + if (PyObject_CheckLongOrConvertToLong(&item)) { + result.emplace_back(static_cast(PyLong_AsLong(item))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of int, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, + i)); + } + } + } else if (obj 
== Py_None) { + return {}; + } else if (PyObject_CheckLongOrConvertToLong(&obj)) { + return {static_cast(PyLong_AsLong(obj))}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -363,10 +416,30 @@ std::vector CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) { i)); } } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GET_ITEM(obj, i); + if (PyObject_CheckLongOrConvertToLong(&item)) { + result.emplace_back(PyLong_AsSize_t(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of size_t, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast(item->ob_type)->tp_name, + i)); + } + } + } else if (obj == Py_None) { + return {}; + } else if (PyObject_CheckLongOrConvertToLong(&obj)) { + return {PyLong_AsSize_t(obj)}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "list, but got %s", + "list of size_t, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } @@ -487,6 +560,9 @@ std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, } } else if (obj == Py_None) { return {}; + } else if (PyObject_IsInstance( + obj, reinterpret_cast(g_framework_tensor_pytype))) { + return {::pybind11::handle(obj).cast()}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " @@ -527,7 +603,8 @@ std::unordered_map CastPyArg2Vocab(PyObject* obj, } } -std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos) { +std::vector CastPyArg2VectorOfString(PyObject* obj, + ssize_t arg_pos) { if (PyList_Check(obj)) { return ::pybind11::handle(obj).cast>(); } else { @@ -1385,16 +1462,8 @@ std::vector CastPyArg2ScalarArray(PyObject* obj, paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { - // In case of IntArray, only two possible PyObjects: - // 1. list of int - // 2. 
Tensor if (obj == Py_None) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "list or Tensor, but got %s", - op_type, - arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + return paddle::experimental::IntArray({}); } // obj could be: int, float, bool, paddle.Tensor @@ -1408,10 +1477,13 @@ paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, paddle::experimental::Tensor& value = GetTensorFromPyObject( op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/); return paddle::experimental::IntArray(value); + } else if (PyObject_CheckLongOrConvertToLong(&obj)) { + return paddle::experimental::IntArray( + {static_cast(PyLong_AsLong(obj))}); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " - "list or Tensor, but got %s", + "list or int, but got %s", op_type, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index ea24711fabd23..654a03ae8808c 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -68,6 +68,7 @@ phi::DenseTensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); +std::vector CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos); std::vector CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos); std::vector> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos); @@ -75,7 +76,8 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); std::unordered_map CastPyArg2Vocab(PyObject* obj, ssize_t arg_pos); -std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); +std::vector CastPyArg2VectorOfString(PyObject* obj, + ssize_t arg_pos); std::shared_ptr CastPyArg2JitFunction(PyObject* obj, ssize_t arg_pos); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 6a6b8841d3e54..5cdd9a0fa0668 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -461,7 +461,11 @@ std::vector CastPyArg2Longs(PyObject* obj, i)); } } - } else if ((PyObject*)obj != Py_None) { // NOLINT + } else if (obj == Py_None) { + return {}; + } else if (PyObject_CheckLongOrToLong(&obj)) { + return {static_cast(PyLong_AsLong(obj))}; + } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " "list or tuple, but got %s", diff --git a/paddle/phi/kernels/prod_kernel.cc b/paddle/phi/kernels/prod_kernel.cc index 1fce5167da958..61ed575a19878 100644 --- a/paddle/phi/kernels/prod_kernel.cc +++ b/paddle/phi/kernels/prod_kernel.cc @@ -25,7 +25,7 @@ void ProdKernel(const Context& dev_ctx, const IntArray& dims, bool keep_dim, DenseTensor* out) { - bool reduce_all = false; // recompute_reduce_all(x, dims); + bool reduce_all = recompute_reduce_all(x, dims); ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); } diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index fb52745c7593d..a27752f2a9d6e 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -29,13 +29,13 @@ def mean_wrapper(x, axis=None, keepdim=False, reduce_all=False): if reduce_all: - return paddle.mean(x, range(len(x.shape)), keepdim) + return 
paddle.mean(x, list(range(len(x.shape))), keepdim) return paddle.mean(x, axis, keepdim) def reduce_mean_wrapper(x, axis=0, keepdim=False, reduce_all=False): if reduce_all: - return paddle.mean(x, range(len(x.shape)), keepdim) + return paddle.mean(x, list(range(len(x.shape))), keepdim) return paddle.mean(x, axis, keepdim) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 3c26703a80d51..d7dcd5412ac38 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -465,12 +465,6 @@ def inf_norm( ): if in_dygraph_mode(): out = _C_ops.abs(input) - reduce_all = ( - True if axis is None or axis == [] or asvector else False - ) - axis = axis if axis is not None and axis != [] else [0] - if reduce_all: - assert (axis == []) or (axis is None) if porder == np.float64('inf'): return _C_ops.max(out, axis, keepdim) else: @@ -844,27 +838,25 @@ def mat_norm(input, porder=1.0, axis=None): Calculate the matrix norm of a square matrix or batches of square matrices, when porder is in (1, -1, inf, -inf) """ - reduce_all = True if axis is None or axis == [] else False - axis = axis if axis is not None and axis != [] else [0] - keepdim = False - if in_dygraph_mode(): abs_out = _C_ops.abs(input) - sum_out = _C_ops.sum(abs_out, axis, None, keepdim) + sum_out = _C_ops.sum(abs_out, axis, None, False) if porder == 1 or porder == np.inf: - return _C_ops.max(sum_out, [-1], keepdim) + return _C_ops.max(sum_out, [-1], False) if porder == -1 or porder == -np.inf: - return _C_ops.min(sum_out, [-1], keepdim) + return _C_ops.min(sum_out, [-1], False) elif _in_legacy_dygraph(): + reduce_all = True if axis is None or axis == [] else False + axis = axis if axis is not None and axis != [] else [0] abs_out = _legacy_C_ops.abs(input) sum_out = _legacy_C_ops.reduce_sum( abs_out, 'dim', axis, 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) @@ -874,7 +866,7 @@ def mat_norm(input, porder=1.0, axis=None): 'dim', [-1], 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) @@ -884,11 +876,13 @@ def mat_norm(input, porder=1.0, axis=None): 'dim', [-1], 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) else: + reduce_all = True if axis is None or axis == [] else False + axis = axis if axis is not None and axis != [] else [0] block = LayerHelper('norm', **locals()) abs_out = block.create_variable_for_type_inference( dtype=block.input_dtype() @@ -908,7 +902,7 @@ def mat_norm(input, porder=1.0, axis=None): outputs={'Out': sum_out}, attrs={ 'dim': axis, - 'keep_dim': keepdim, + 'keep_dim': False, 'reduce_all': reduce_all, }, ) @@ -919,7 +913,7 @@ def mat_norm(input, porder=1.0, axis=None): outputs={'Out': out}, attrs={ 'dim': [-1], - 'keep_dim': keepdim, + 'keep_dim': False, 'reduce_all': reduce_all, }, ) @@ -930,7 +924,7 @@ def mat_norm(input, porder=1.0, axis=None): outputs={'Out': out}, attrs={ 'dim': [-1], - 'keep_dim': keepdim, + 'keep_dim': False, 'reduce_all': reduce_all, }, ) @@ -941,22 +935,20 @@ def fro_norm(input, porder=2, axis=[-1]): NOTE: Calculate the frobenius norm of a square matrix or batches of square matrices. 
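The same reordering recurs across linalg.py, math.py and stat.py in this patch: the eager branch calls the C++ op first, and reduce_all is computed only for the legacy/static branches, because the kernels now derive it from axis themselves (see the prod_kernel.cc change above). A condensed sketch of the pattern; the wrapper name is made up and the internal import paths are assumptions:

import paddle
from paddle import _C_ops
from paddle.fluid.framework import in_dygraph_mode

def eager_first_mean(x, axis=None, keepdim=False):
    if in_dygraph_mode():
        # eager fast path: axis may be None, a scalar or a list; the widened
        # CastPyArg2IntArray above accepts all of these, and the kernel works
        # out reduce_all on its own
        return _C_ops.mean(x, axis, keepdim)
    # legacy dygraph / static graph still need the explicit reduce_all flag,
    # as in the diffs that follow
    return paddle.mean(x, axis, keepdim)

print(eager_first_mean(paddle.rand([3, 4]), axis=1))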
""" - reduce_all = True if axis is None or axis == [] else False - keepdim = False - if in_dygraph_mode(): pow_out = _C_ops.pow(input, porder) - sum_out_1 = _C_ops.sum(pow_out, axis, None, keepdim) - sum_out_2 = _C_ops.sum(sum_out_1, axis, None, keepdim) + sum_out_1 = _C_ops.sum(pow_out, axis, None, False) + sum_out_2 = _C_ops.sum(sum_out_1, axis, None, False) return _C_ops.pow(sum_out_2, float(1.0 / porder)) elif paddle.in_dynamic_mode(): + reduce_all = True if axis is None or axis == [] else False pow_out = _legacy_C_ops.pow(input, 'factor', porder) sum_out_1 = _legacy_C_ops.reduce_sum( pow_out, 'dim', axis, 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) @@ -965,12 +957,13 @@ def fro_norm(input, porder=2, axis=[-1]): 'dim', axis, 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) return _legacy_C_ops.pow(sum_out_2, 'factor', float(1.0 / porder)) + reduce_all = True if axis is None or axis == [] else False block = LayerHelper('norm', **locals()) pow_out = block.create_variable_for_type_inference( dtype=block.input_dtype() @@ -994,13 +987,13 @@ def fro_norm(input, porder=2, axis=[-1]): type='reduce_sum', inputs={'X': pow_out}, outputs={'Out': sum_out_1}, - attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + attrs={'dim': axis, 'keep_dim': False, 'reduce_all': reduce_all}, ) block.append_op( type='reduce_sum', inputs={'X': sum_out_1}, outputs={'Out': sum_out_2}, - attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + attrs={'dim': axis, 'keep_dim': False, 'reduce_all': reduce_all}, ) block.append_op( type='pow', @@ -1016,28 +1009,27 @@ def svd_norm(input, porder, axis=[-1]): Calculate the matrix norm, which is related to singular values, of a matrix or batches of matrices, including nuclear norm, 2-norm and (-2)-norm. 
""" - reduce_all = True if axis is None or axis == [] else False - keepdim = False - + if not in_dygraph_mode(): + reduce_all = True if axis is None or axis == [] else False u, s, vh = svd(input, full_matrices=False) if _non_static_mode(): if porder == "nuc": if in_dygraph_mode(): - return _C_ops.sum(s, axis, None, keepdim) + return _C_ops.sum(s, axis, None, False) else: return _legacy_C_ops.reduce_sum( s, 'dim', axis, 'keepdim', - keepdim, + False, 'reduce_all', reduce_all, ) if in_dygraph_mode(): - max_out = _C_ops.max(s, axis, keepdim) - min_out = _C_ops.min(s, axis, keepdim) + max_out = _C_ops.max(s, axis, False) + min_out = _C_ops.min(s, axis, False) if porder == 2: return _C_ops.divide(max_out, min_out) if porder == -2: @@ -1045,10 +1037,10 @@ def svd_norm(input, porder, axis=[-1]): else: max_out = _legacy_C_ops.reduce_max( - s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all + s, 'dim', axis, 'keepdim', False, 'reduce_all', reduce_all ) min_out = _legacy_C_ops.reduce_min( - s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all + s, 'dim', axis, 'keepdim', False, 'reduce_all', reduce_all ) if porder == 2: return _legacy_C_ops.elementwise_div( @@ -1070,7 +1062,7 @@ def svd_norm(input, porder, axis=[-1]): outputs={'Out': out}, attrs={ 'dim': axis, - 'keep_dim': keepdim, + 'keep_dim': False, 'reduce_all': reduce_all, }, ) @@ -1085,13 +1077,13 @@ def svd_norm(input, porder, axis=[-1]): type='reduce_max', inputs={'X': s}, outputs={'Out': max_out}, - attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + attrs={'dim': axis, 'keep_dim': False, 'reduce_all': reduce_all}, ) block.append_op( type='reduce_min', inputs={'X': s}, outputs={'Out': min_out}, - attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + attrs={'dim': axis, 'keep_dim': False, 'reduce_all': reduce_all}, ) if porder == 2: block.append_op( diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 62dfcad0b3500..5a49c659952da 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1303,7 +1303,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): out8 = paddle.sum(x, axis=0) # [1, 1, 1, 1] out9 = paddle.sum(x, axis=1) # [4, 0] """ - reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) dtype_flag = False if dtype is not None: @@ -1313,6 +1312,8 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): if in_dygraph_mode(): return _C_ops.sum(x, axis, dtype, keepdim) + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) + if _in_legacy_dygraph(): if dtype_flag: return _legacy_C_ops.reduce_sum( @@ -2382,9 +2383,9 @@ def max(x, axis=None, keepdim=False, name=None): #[7., 8.], [[[0., 0.], [0., 0.]], [[0., 0.], [1., 1.]]] """ - reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if in_dygraph_mode(): return _C_ops.max(x, axis, keepdim) + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_max( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all @@ -2484,10 +2485,10 @@ def min(x, axis=None, keepdim=False, name=None): #[1., 2.], [[[1., 1.], [0., 0.]], [[0., 0.], [0., 0.]]] """ - reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if in_dygraph_mode(): return _C_ops.min(x, axis, keepdim) + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_min( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all @@ -2597,10 +2598,10 @@ def amax(x, axis=None, keepdim=False, name=None): 
print(result6, y.grad) #[0.9., 0.9], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - - reduce_all, axis = _get_reduce_axis(axis, x) if in_dygraph_mode(): return _C_ops.amax(x, axis, keepdim) + + reduce_all, axis = _get_reduce_axis(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_amax( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all @@ -2711,11 +2712,11 @@ def amin(x, axis=None, keepdim=False, name=None): print(result6, y.grad) #[0.1., 0.1], [[[0., 0.3333], [0.5, 0.3333]], [[0.5, 0.3333], [1., 1.]]] """ - - reduce_all, axis = _get_reduce_axis(axis, x) if in_dygraph_mode(): return _C_ops.amin(x, axis, keepdim) - elif _in_legacy_dygraph(): + + reduce_all, axis = _get_reduce_axis(axis, x) + if _in_legacy_dygraph(): return _legacy_C_ops.reduce_amin( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all ) @@ -3860,11 +3861,10 @@ def all(x, axis=None, keepdim=False, name=None): print(out4) """ - reduce_all, axis = _get_reduce_axis(axis, x) - if in_dygraph_mode(): return _C_ops.all(x, axis, keepdim) + reduce_all, axis = _get_reduce_axis(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_all( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all @@ -3937,11 +3937,10 @@ def any(x, axis=None, keepdim=False, name=None): print(out4) """ - reduce_all, axis = _get_reduce_axis(axis, x) - if in_dygraph_mode(): return _C_ops.any(x, axis, keepdim) + reduce_all, axis = _get_reduce_axis(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_any( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 8fc5fe42cbd9b..4eefa198ea437 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -79,11 +79,10 @@ def mean(x, axis=None, keepdim=False, name=None): out4 = paddle.mean(x, axis=[0, 2]) # [ 8.5 12.5 16.5] """ - - reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) - if in_dygraph_mode(): return _C_ops.mean(x, axis, keepdim) + + reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) if _in_legacy_dygraph(): return _legacy_C_ops.reduce_mean( x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all From dd27996c3b597e32a1a60e4b6c7cad8f3279c71b Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 24 Nov 2022 09:43:36 +0800 Subject: [PATCH 186/210] fix adam thread num (#48297) --- paddle/phi/kernels/gpu/adam_kernel.cu | 4 ++-- paddle/phi/kernels/gpu/adamw_kernel.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 0597311e219da..c4c9ff9e06ce0 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -253,7 +253,7 @@ void AdamDenseKernel(const Context& dev_ctx, param.numel()); if (!use_global_beta_pow) { // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + UpdateBetaPow<<<1, 1, 0, dev_ctx.stream()>>>( beta1_, beta2_, beta1_pow.data(), @@ -352,7 +352,7 @@ void MergedAdamKernel( param[idx]->numel()); if (!use_global_beta_pow) { // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + UpdateBetaPow<<<1, 1, 0, dev_ctx.stream()>>>( beta1_, beta2_, beta1_pow[idx]->data(), diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 6994c83f53624..2252deb1da2ef 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -282,7 +282,7 @@ void 
AdamwDenseKernel(const Context& dev_ctx, param.numel()); if (!use_global_beta_pow) { // Update with gpu - UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + UpdateAdamWBetaPow<<<1, 1, 0, dev_ctx.stream()>>>( beta1_, beta2_, beta1_pow.data(), From f254d0a0f77c21af2f0fb34950e3bce60fb3e753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 24 Nov 2022 10:12:24 +0800 Subject: [PATCH 187/210] [Fluid Clean] remove expand and eye under fluid.layers (#47996) * remove expand and eye under fluid.layers * delete expand API test case --- python/paddle/fluid/layers/nn.py | 123 ------------------ .../seq2seq_dygraph_model.py | 31 ++--- .../dygraph_to_static/test_sentiment.py | 9 +- .../transformer_dygraph_model.py | 13 +- .../tests/unittests/ipu/test_expand_op_ipu.py | 6 +- .../tests/unittests/npu/test_expand_op_npu.py | 2 +- .../test_dynamic_rnn_stop_gradient.py | 4 +- .../unittests/test_eager_deletion_while_op.py | 2 +- .../fluid/tests/unittests/test_expand_op.py | 59 --------- .../tests/unittests/test_imperative_basic.py | 8 +- .../test_imperative_ocr_attention_model.py | 5 +- ...perative_star_gan_with_gradient_penalty.py | 4 +- .../fluid/tests/unittests/test_layers.py | 8 -- .../fluid/tests/unittests/test_nn_grad.py | 25 ---- 14 files changed, 42 insertions(+), 257 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a140894990bb1..bd5b11e1364b5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -120,7 +120,6 @@ 'pad2d', 'unique', 'unique_with_counts', - 'expand', 'scale', 'elementwise_add', 'elementwise_div', @@ -7970,128 +7969,6 @@ def flatten(x, axis=1, name=None): return out -@deprecated(since='2.0.0', update_to="paddle.expand") -def expand(x, expand_times, name=None): - """ - :alias_main: paddle.expand - :alias: paddle.expand,paddle.tensor.expand,paddle.tensor.manipulation.expand - :old_api: paddle.fluid.layers.expand - - This operation tiles ``x`` multiple times according to the parameter ``expand_times``. - The times number for each dimension of ``x`` is set by the parameter ``expand_times``. - The rank of ``x`` should be less than or equal to 6. Please note that size of ``expand_times`` must be the same - with X's rank. Following is a using case: - - - .. code-block:: text - - Input(X) is a 3-D tensor with shape [2, 3, 1]: - - [ - [[1], [2], [3]], - [[4], [5], [6]] - ] - - Attr(expand_times): [1, 2, 2] - - Output(Out) is a 3-D tensor with shape [2, 6, 2]: - - [ - [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]], - [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]] - ] - - Args: - x (Variable): A ``Tensor`` or ``LoDTensor`` with dimension in [1, 6]. The data type is ``bool``, ``float32``, ``float64`` or ``int32`` . - expand_times (list|tuple|Variable): The data type is ``int32`` . If ``expand_times`` is a list or tuple, the elements of - it should be integers or Tensors with shape [1]. If ``expand_times`` is an Variable, it should be an 1-D Tensor. - Expand times number for each dimension of ``x`` . - name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Variable: A ``Tensor`` or ``LoDTensor``. The data type is same as ``x``. After expanding, size of each dimension of output is equal to the size of the corresponding dimension of ``x`` multiplying the corresponding value given by ``expand_times`` . 
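For reference (this mapping is editorial, not part of the patch): the tiling behaviour described in the removed docstring above is what paddle.tile provides, while paddle.expand, the replacement used at the call sites migrated below, broadcasts and can therefore only grow dims of size 1:

import paddle

x = paddle.zeros([2, 3, 1], dtype='int32')

# old: fluid.layers.expand(x, expand_times=[1, 2, 2]) -> shape [2, 6, 2]
y = paddle.tile(x, repeat_times=[1, 2, 2])
print(y.shape)   # [2, 6, 2]

# paddle.expand takes a target shape (-1 keeps a dim) and only broadcasts size-1 dims
z = paddle.expand(x, shape=[2, 3, 4])
print(z.shape)   # [2, 3, 4]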
- - Raises: - TypeError: The type of ``expand_times`` must be list, tuple or Variable. - ValueError: The elements of ``expand_times`` cannot be negative. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - # example 1: - data_1 = fluid.layers.fill_constant(shape=[2, 3, 1], dtype='int32', value=0) - expanded_1 = fluid.layers.expand(data_1, expand_times=[1, 2, 2]) - # the shape of expanded_1 is [2, 6, 2]. - - # example 2: - data_2 = fluid.layers.fill_constant(shape=[12, 14], dtype="int32", value=3) - expand_times = fluid.layers.fill_constant(shape=[2], dtype="int32", value=4) - expanded_2 = fluid.layers.expand(data_2, expand_times=expand_times) - # the shape of expanded_2 is [48, 56]. - """ - if _non_static_mode(): - attrs = () - expand_times_tensor = None - if isinstance(expand_times, (list, tuple)): - expand_times = [ - item.numpy().item(0) if isinstance(item, Variable) else item - for item in expand_times - ] - attrs += ('expand_times', expand_times) - elif isinstance(expand_times, Variable): - expand_times_tensor = expand_times - expand_times_tensor.stop_gradient = True - - return _legacy_C_ops.expand(x, expand_times_tensor, *attrs) - - inputs = {"X": [x]} - attrs = {} - check_variable_and_dtype( - x, - 'x', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'expand', - ) - check_type(expand_times, 'expand_times', (list, tuple, Variable), 'expand') - if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == True: - raise ValueError( - "expand op bool date type must set the stop_gradient to be False" - ) - - helper = LayerHelper('expand', input=x, **locals()) - - def get_attr_expand_times(list_expand_times): - attrs_expand_times = [] - for idx, times in enumerate(list_expand_times): - if isinstance(times, Variable): - attrs_expand_times.append(-1) - else: - attrs_expand_times.append(times) - assert ( - times > 0 - ), "Each element given in expand_times must not be negative." 
- return attrs_expand_times - - if isinstance(expand_times, Variable): - expand_times.stop_gradient = True - inputs['ExpandTimes'] = expand_times - elif isinstance(expand_times, (list, tuple)): - attrs['expand_times'] = get_attr_expand_times(expand_times) - if utils._contain_var(expand_times): - inputs['expand_times_tensor'] = utils._convert_to_tensor_list( - expand_times - ) - - dtype = helper.input_dtype(input_param_name='x') - out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs - ) - return out - - from paddle.fluid.framework import convert_np_dtype_to_dtype_ diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 4b52df98c22d5..c650870f75d08 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -186,9 +186,9 @@ def _split_batch_beams(self, x): def _expand_to_beam_size(self, x): x = fluid.layers.unsqueeze(x, [1]) - expand_times = [1] * len(x.shape) - expand_times[1] = self.beam_size - x = fluid.layers.expand(x, expand_times) + expand_shape = [-1] * len(x.shape) + expand_shape[1] = self.beam_size * x.shape[1] + x = paddle.expand(x, expand_shape) return x def _real_state(self, state, new_state, step_mask): @@ -386,19 +386,20 @@ def beam_search(self, inputs): [[0.0] + [-self.kinf] * (self.beam_size - 1)], dtype="float32" ) ) - beam_state_log_probs = fluid.layers.expand( - beam_state_log_probs, [self.batch_size, 1] + beam_state_log_probs = paddle.expand( + beam_state_log_probs, + [self.batch_size * beam_state_log_probs.shape[0], -1], ) dec_hidden, dec_cell = enc_hidden, enc_cell dec_hidden = [self._expand_to_beam_size(ele) for ele in dec_hidden] dec_cell = [self._expand_to_beam_size(ele) for ele in dec_cell] - batch_pos = fluid.layers.expand( + batch_pos = paddle.expand( fluid.layers.unsqueeze( to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), [1], ), - [1, self.beam_size], + [-1, self.beam_size], ) predicted_ids = [] parent_ids = [] @@ -442,9 +443,9 @@ def beam_search(self, inputs): ) step_log_probs = fluid.layers.elementwise_mul( - fluid.layers.expand( + paddle.expand( fluid.layers.unsqueeze(beam_finished, [2]), - [1, 1, self.tar_vocab_size], + [-1, -1, self.tar_vocab_size], ), noend_mask_tensor, axis=-1, @@ -650,9 +651,9 @@ def _merge_batch_beams(self, x): def tile_beam_merge_with_batch(self, x): x = fluid.layers.unsqueeze(x, [1]) # [batch_size, 1, ...] - expand_times = [1] * len(x.shape) - expand_times[1] = self.beam_size - x = fluid.layers.expand(x, expand_times) # [batch_size, beam_size, ...] + expand_shape = [-1] * len(x.shape) + expand_shape[1] = self.beam_size * x.shape[1] + x = paddle.expand(x, expand_shape) # [batch_size, beam_size, ...] 
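The call-site rewrites in this patch all follow the recipe visible above: give the dim to be grown size 1 (usually via unsqueeze), then paddle.expand it, with -1 marking dims to keep. A standalone version of the beam-size case, with made-up sizes:

import paddle

batch_size, beam_size, hidden = 4, 3, 8
x = paddle.rand([batch_size, hidden])

x = paddle.unsqueeze(x, [1])              # [batch_size, 1, hidden]
expand_shape = [-1] * len(x.shape)
expand_shape[1] = beam_size * x.shape[1]  # x.shape[1] == 1, so this is just beam_size
x = paddle.expand(x, expand_shape)        # [batch_size, beam_size, hidden]
print(x.shape)                            # [4, 3, 8]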
x = paddle.transpose( x, list(range(2, len(x.shape))) + [0, 1] ) # [..., batch_size, beam_size] @@ -670,9 +671,9 @@ def _split_batch_beams(self, x): def _expand_to_beam_size(self, x): x = fluid.layers.unsqueeze(x, [1]) - expand_times = [1] * len(x.shape) - expand_times[1] = self.beam_size - x = fluid.layers.expand(x, expand_times) + expand_shape = [-1] * len(x.shape) + expand_shape[1] = self.beam_size * x.shape[1] + x = paddle.expand(x, expand_shape) return x def _real_state(self, state, new_state, step_mask): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 13593e1c02c08..ed7a3b8cc2f6e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -95,7 +95,7 @@ def forward(self, inputs, label=None): o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( dtype='float32' ) - mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) + mask_emb = paddle.expand(o_np_mask, [-1, self.hid_dim]) emb = emb * mask_emb emb = paddle.reshape( emb, shape=[-1, self.channels, self.seq_len, self.hid_dim] @@ -141,7 +141,7 @@ def forward(self, inputs, label=None): o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( dtype='float32' ) - mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) + mask_emb = paddle.expand(o_np_mask, [-1, self.hid_dim]) emb = emb * mask_emb emb = paddle.reshape(emb, shape=[-1, self.seq_len, self.hid_dim]) bow_1 = fluid.layers.reduce_sum(emb, dim=1) @@ -189,7 +189,7 @@ def forward(self, inputs, label=None): o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( 'float32' ) - mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) + mask_emb = paddle.expand(o_np_mask, [-1, self.hid_dim]) emb = emb * mask_emb emb = paddle.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) @@ -243,7 +243,8 @@ def forward(self, inputs, label=None): o_np_mask = (paddle.reshape(inputs, [-1, 1]) != self.dict_dim).astype( 'float32' ) - mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim]) + mask_emb = paddle.expand(o_np_mask, [-1, self.hid_dim]) + emb = emb * mask_emb emb = paddle.reshape(emb, shape=[self.batch_size, -1, self.hid_dim]) fc_1 = self._fc1(emb) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index ee11e045d9aad..18c94262c913b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -701,9 +701,9 @@ def expand_to_beam_size(tensor, beam_size): tensor = paddle.reshape( tensor, [tensor.shape[0], 1] + list(tensor.shape[1:]) ) - tile_dims = [1] * len(tensor.shape) + tile_dims = [-1] * len(tensor.shape) tile_dims[1] = beam_size - return layers.expand(tensor, tile_dims) + return paddle.expand(tensor, tile_dims) def merge_batch_beams(tensor): var_dim_in_state = 2 # count in beam dim @@ -757,8 +757,9 @@ def split_batch_beams(tensor): def mask_probs(probs, finished, noend_mask_tensor): finished = layers.cast(finished, dtype=probs.dtype) probs = layers.elementwise_mul( - layers.expand( - layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size] + paddle.expand( + layers.unsqueeze(finished, [2]), + [-1, -1, self.trg_vocab_size], ), 
noend_mask_tensor, axis=-1, @@ -785,11 +786,11 @@ def gather(input, indices, batch_pos): noend_array = [-inf] * self.trg_vocab_size noend_array[eos_id] = 0 noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32")) - batch_pos = layers.expand( + batch_pos = paddle.expand( layers.unsqueeze( to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1] ), - [1, beam_size], + [-1, beam_size], ) predict_ids = [] parent_ids = [] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 784a6a41a41f5..d0cf2d5be9510 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -46,7 +46,7 @@ def build_model(self): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32" ) - out = paddle.fluid.layers.expand(x, **self.attrs) + out = paddle.expand(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): @@ -82,9 +82,7 @@ def build_model(self): expand_times = paddle.fluid.layers.fill_constant( shape=[len(self.feed_shape[0])], dtype="int32", value=2 ) - out = paddle.fluid.layers.expand( - x, expand_times=expand_times, **self.attrs - ) + out = paddle.expand(x, expand_times, **self.attrs) self.fetch_list = [out.name] diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 0ac8cfc5d3c13..5b4bef5361b4c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -97,7 +97,7 @@ def _test(self, run_npu=True): name="label", shape=[32, 1], dtype='int64' ) - res = paddle.fluid.layers.expand(a, [1, 32]) + res = paddle.expand(a, [-1, 32]) loss = res.sum() sgd = fluid.optimizer.SGD(learning_rate=0.01) sgd.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py index f3f971b5778d8..d20ad3d0c0ec6 100644 --- a/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_dynamic_rnn_stop_gradient.py @@ -43,9 +43,9 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): for _ in range(20): bs = layers.cast(bs, 'int64') bs.stop_gradient = stop_gradient - batch_pos = layers.expand( + batch_pos = paddle.expand( layers.unsqueeze(paddle.arange(0, bs, 1, dtype=bs.dtype), [1]), - [1, beam_size], + [-1, beam_size], ) topk_coordinates = paddle.stack([batch_pos, indices], axis=2) topk_coordinates.stop_gradient = stop_gradient diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 37ee4897e77f2..8c359a39195e3 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -128,7 +128,7 @@ def run_main(self, place, with_data_parallel): sum_result = layers.array_read(array=mem_array, i=j) sum_result.persistable = True tmp = layers.unsqueeze(sum_result, axes=[0]) - tmp = layers.expand(tmp, expand_times=[10, 1]) + tmp = paddle.expand(tmp, [10, -1]) fc = layers.fc(tmp, size=256) loss = paddle.mean(sum_result) diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index 
fd3dac2472e7c..9fb4c7c804e01 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -16,8 +16,6 @@ import numpy as np from op_test import OpTest import paddle.fluid as fluid -from paddle.fluid import Program, program_guard -import paddle # Situation 1: expand_times is a list(without tensor) @@ -201,62 +199,5 @@ def test_check_output(self): self.check_output() -class TestExpandError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace() - ) - expand_times = [2, 2] - self.assertRaises(TypeError, fluid.layers.expand, x1, expand_times) - x2 = fluid.layers.data(name='x2', shape=[4], dtype="uint8") - self.assertRaises(TypeError, fluid.layers.expand, x2, expand_times) - x3 = fluid.layers.data(name='x3', shape=[4], dtype="bool") - x3.stop_gradient = True - self.assertRaises(ValueError, fluid.layers.expand, x3, expand_times) - - -# Test python API -class TestExpandAPI(unittest.TestCase): - def test_api(self): - input = np.random.random([12, 14]).astype("float32") - x = fluid.layers.data( - name='x', shape=[12, 14], append_batch_size=False, dtype="float32" - ) - - positive_2 = fluid.layers.fill_constant([1], "int32", 2) - expand_times = fluid.layers.data( - name="expand_times", shape=[2], append_batch_size=False - ) - - out_1 = fluid.layers.expand(x, expand_times=[2, 3]) - out_2 = fluid.layers.expand(x, expand_times=[positive_2, 3]) - out_3 = fluid.layers.expand(x, expand_times=expand_times) - - g0 = fluid.backward.calc_gradient(out_2, x) - - exe = fluid.Executor(place=fluid.CPUPlace()) - res_1, res_2, res_3 = exe.run( - fluid.default_main_program(), - feed={"x": input, "expand_times": np.array([1, 3]).astype("int32")}, - fetch_list=[out_1, out_2, out_3], - ) - assert np.array_equal(res_1, np.tile(input, (2, 3))) - assert np.array_equal(res_2, np.tile(input, (2, 3))) - assert np.array_equal(res_3, np.tile(input, (1, 3))) - - -class TestExpandDygraphAPI(unittest.TestCase): - def test_expand_times_is_tensor(self): - with paddle.fluid.dygraph.guard(): - a = paddle.rand([2, 5]) - b = paddle.fluid.layers.expand(a, expand_times=[2, 3]) - c = paddle.fluid.layers.expand( - a, expand_times=paddle.to_tensor([2, 3], dtype='int32') - ) - np.testing.assert_array_equal(b.numpy(), np.tile(a.numpy(), [2, 3])) - np.testing.assert_array_equal(c.numpy(), np.tile(a.numpy(), [2, 3])) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 076c38773d583..5f543c04842da 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -738,13 +738,13 @@ def func_dygraph_vs_static(self): name='inp2', shape=[3, 3], dtype=np.float32 ) - a = fluid.layers.expand( + a = paddle.expand( paddle.reshape(fluid.layers.reduce_sum(inp_data1), [1, 1]), - [4, 1], + [4, -1], ) - b = fluid.layers.expand( + b = paddle.expand( paddle.reshape(fluid.layers.reduce_sum(inp_data2), [1, 1]), - [4, 1], + [4, -1], ) cond = fluid.layers.less_than(x=a, y=b) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 0eb20a7dca158..5ab7ef78a3622 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -306,8 +306,9 @@ def forward(self, encoder_vec, encoder_proj, decoder_state): decoder_state_proj_reshape = paddle.reshape( decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]] ) - decoder_state_expand = fluid.layers.expand( - decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1] + decoder_state_expand = paddle.expand( + decoder_state_proj_reshape, + [-1, encoder_proj.shape[1], -1], ) concated = fluid.layers.elementwise_add( encoder_proj, decoder_state_expand diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 2c8d408316b6d..d623a277cf006 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -310,9 +310,7 @@ def __init__(self, cfg, num_channels=3): def forward(self, input, label_trg): shape = input.shape label_trg_e = paddle.reshape(label_trg, [-1, label_trg.shape[1], 1, 1]) - label_trg_e = fluid.layers.expand( - x=label_trg_e, expand_times=[1, 1, shape[2], shape[3]] - ) + label_trg_e = paddle.expand(label_trg_e, [-1, -1, shape[2], shape[3]]) input1 = fluid.layers.concat([input, label_trg_e], 1) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ad1226e12d4a3..f191a948c9d2c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3554,14 +3554,6 @@ def make_cross_entropy(self): out = layers.cross_entropy(x, label, False, 4) return out - def make_expand(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data(name="input", shape=[10], dtype='int32') - out = layers.expand(x, [1, 2]) - return out - def make_uniform_random_batch_size_like(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 274aa25142e40..c7473895b08fb 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -114,31 +114,6 @@ def test_grad(self): class TestReshapeDoubleGradCheck(unittest.TestCase): - @prog_scope() - def func(self, place): - x_shape = [3, 12] - expand_times = [4, 9] - eps = 0.005 - dtype = np.float64 - - x = layers.data('x', x_shape, False, dtype) - x.persistable = True - out = layers.expand(x, expand_times) - x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) - - gradient_checker.double_grad_check( - [x], out, x_init=x_arr, place=place, eps=eps - ) - - def test_grad(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - self.func(p) - - -class TestExpandDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): x_shape = [3, 12] From 5f995d3fd2b89b1d1d3f4b5311e0f4c6fdf89daa Mon Sep 17 00:00:00 2001 From: james Date: Thu, 24 Nov 2022 10:26:41 +0800 Subject: [PATCH 188/210] processgroup bkcl support reduce (#48232) Note: this is a temporary solution, should be replaced once reduce kernel is natively supported on KL2 --- .../collective/ProcessGroupBKCL.cc | 51 +++++++++++++++++++ .../distributed/collective/ProcessGroupBKCL.h | 6 +++ 
.../tests/unittests/xpu/process_group_bkcl.py | 21 ++++++++ 3 files changed, 78 insertions(+) diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc index 75953dc0b4289..ff39196b92b3a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc @@ -260,6 +260,57 @@ std::shared_ptr ProcessGroupBKCL::AllGather( use_calc_stream); } +std::shared_ptr ProcessGroupBKCL::Reduce( + phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + out_tensor, + in_tensor, + [&](phi::DenseTensor* output, + const phi::DenseTensor& input, + BKCLContext_t comm, + const XPUStream& stream) { + phi::DenseTensor output_t(*output); + const auto& place = input.place(); + auto* calc_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + switch (input.dtype()) { + case phi::DataType::FLOAT32: + calc_ctx->template Alloc(&output_t); + break; + case phi::DataType::FLOAT16: + calc_ctx->template Alloc(&output_t); + break; + case phi::DataType::INT32: + calc_ctx->template Alloc(&output_t); + break; + default: + VLOG(0) << "Error: type " << input.dtype() << " not supported for " + << GetBackendName(); + break; + } + int ret = + bkcl_all_reduce(comm, + input.data(), + output_t.data(), + input.numel(), + platform::ToBKCLDataType( + framework::TransToProtoVarType(input.type())), + ToBKCLRedType(opts.reduce_op), + stream); + if (rank_ == opts.root_rank) { + *output = output_t; + } + return ret; + }, + CommType::ALLREDUCE, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupBKCL::Barrier( const BarrierOptions& opts) { PADDLE_ENFORCE_GE(opts.device_id, diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h index b4a47e83fdd8a..79d97609d9274 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupBKCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupBKCL.h @@ -107,6 +107,12 @@ class ProcessGroupBKCL : public ProcessGroupStream { bool sync_op, bool use_calc_stream) override; + std::shared_ptr Reduce(phi::DenseTensor* out_tensor, + const phi::DenseTensor& in_tensor, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py index 2317e38cb28d0..a106c630f3634 100644 --- a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py +++ b/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py @@ -168,6 +168,27 @@ def test_create_process_group_bkcl(self): "rank {}: test allgather api2 ok\n".format(pg.rank()) ) + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = dist.reduce(tensor_x, 0, sync_op=True) + paddle.device.xpu.synchronize() + # rank 1 + else: + task = dist.reduce(tensor_y, 0, sync_op=False) + task.wait() + paddle.device.xpu.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + sys.stdout.write( + "rank {}: test reduce sum api ok\n".format(pg.rank()) + ) + class 
TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): From 561b727834529fe5613a31edc2170f66bd4b8add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Thu, 24 Nov 2022 03:29:20 +0100 Subject: [PATCH 189/210] [PHI] Migrate batch_norm_grad kernel (#48288) --- paddle/fluid/operators/batch_norm_op.cc | 2 +- paddle/fluid/operators/inplace_abn_op.cc | 2 +- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 199 ------------------ paddle/fluid/operators/unity_build_rule.cmake | 1 - paddle/phi/backends/onednn/onednn_reuse.h | 54 +++-- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 1 - .../kernels/onednn/batch_norm_grad_kernel.cc | 134 ++++++++++++ 7 files changed, 178 insertions(+), 215 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc create mode 100644 paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 7452c64f6fca8..a20b2ad21d3e9 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -350,7 +350,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { true, platform::errors::InvalidArgument( "Using global stats during training is not supported " - "in gradient op kernel of batch_norm_mkldnn_op now.")); + "in oneDNN version of batch_norm_gradient kernel now.")); } OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormGrad"); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index f87d7effcae45..61379a3d893ea 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -113,7 +113,7 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { true, platform::errors::InvalidArgument( "Using global stats during training is not supported " - "in gradient op kernel of batch_norm_mkldnn_op now.")); + "in oneDNN version of batch_norm_gradient kernel now.")); } OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "InplaceABNGrad"); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc deleted file mode 100644 index aeba1e0ae6379..0000000000000 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ /dev/null @@ -1,199 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace operators { - -using dnnl::memory; -using dnnl::primitive; -using dnnl::stream; -using paddle::platform::MKLDNNDeviceContext; - -template -class BatchNormMKLDNNHandler : public phi::funcs::OneDNNHandlerNoCachingT< - T, - dnnl::batch_normalization_forward, - dnnl::batch_normalization_backward> { - public: - BatchNormMKLDNNHandler(const paddle::framework::ExecutionContext &ctx, - const dnnl::engine mkldnn_engine, - const Tensor *in_x, - const Tensor *scale, - const Tensor *out_grad) - : phi::funcs::OneDNNHandlerNoCachingT( - mkldnn_engine, ctx.GetPlace()) { - auto scale_tz = phi::vectorize(scale->dims()); - PADDLE_ENFORCE_EQ( - scale_tz.size(), - 1, - platform::errors::InvalidArgument( - "Dims of scale tensor must be 1, but received scale's size is %d", - scale_tz.size())); - - const float epsilon = ctx.Attr("epsilon"); - - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_training, - in_x->mem_desc(), - epsilon, - dnnl::normalization_flags::use_scale_shift); - this->AcquireBackwardPrimitiveDescriptor( - dnnl::prop_kind::backward, - out_grad->mem_desc(), - in_x->mem_desc(), - epsilon, - dnnl::normalization_flags::use_scale_shift); - } - - std::shared_ptr AcquireScaleShiftMemory(const Tensor *scale, - const Tensor *shift) { - auto scale_tz = phi::vectorize(scale->dims()); - const unsigned int C = scale_tz[0]; - PADDLE_ENFORCE_EQ( - scale_tz.size(), - 1, - platform::errors::InvalidArgument( - "Dims of scale tensor must be 1, but received scale's size is %d", - scale_tz.size())); - - auto scaleshift_memory = - this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc()); - - // MKLDNN requires a single piece of memory for scale and shift/bias data - auto mem_data_handle = - reinterpret_cast(scaleshift_memory->get_data_handle()); - std::copy(scale->data(), scale->data() + C, mem_data_handle); - std::copy(shift->data(), shift->data() + C, mem_data_handle + C); - return scaleshift_memory; - } - - std::shared_ptr AcquireDiffScaleShiftMemory( - T *diff_scaleshift_data) { - return this->AcquireMemoryFromPrimitive(this->bwd_pd_->diff_weights_desc(), - diff_scaleshift_data); - } - - std::shared_ptr AcquireMeanMemory( - const phi::DenseTensor *mean) { - const T *mean_data = mean->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->mean_desc(), phi::funcs::to_void_cast(mean_data)); - } - - std::shared_ptr AcquireMeanMemory(phi::DenseTensor *mean) { - T *mean_data = mean->mutable_data(this->place_, - this->fwd_pd_->mean_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(), - mean_data); - } - - std::shared_ptr AcquireVarianceMemory( - const phi::DenseTensor *variance) { - const T *variance_data = variance->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->variance_desc(), - phi::funcs::to_void_cast(variance_data)); - } - - std::shared_ptr AcquireVarianceMemory( - phi::DenseTensor *variance) { - T *variance_data = variance->mutable_data( - this->place_, this->fwd_pd_->variance_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(), - variance_data); - } -}; - -template -class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - auto 
mkldnn_engine = dev_ctx.GetEngine(); - - const auto *x = ctx.Input("X"); - const auto *scale = ctx.Input("Scale"); - const auto *shift = ctx.Input("Bias"); - const auto *batch_mean = ctx.Input("SavedMean"); - const auto *batch_variance = ctx.Input("SavedVariance"); - const auto *diff_y = - ctx.Input(framework::GradVarName("Y")); - auto *diff_x = ctx.Output(framework::GradVarName("X")); - auto *diff_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *diff_shift = - ctx.Output(framework::GradVarName("Bias")); - - BatchNormMKLDNNHandler handler(ctx, mkldnn_engine, x, scale, diff_y); - - // MKLDNN requires a single piece of memory for scale and shift/bias data - const unsigned int C = phi::vectorize(scale->dims())[0]; - const size_t scaleshift_size = 2 * C; - std::vector diff_scaleshift_data; - diff_scaleshift_data.reserve(scaleshift_size); - - auto src_memory = handler.AcquireSrcMemory(x); - auto mean_memory = handler.AcquireMeanMemory(batch_mean); - auto variance_memory = handler.AcquireVarianceMemory(batch_variance); - auto diff_dst_memory = handler.AcquireDiffDstMemory(diff_y); - auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift); - auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x); - auto diff_scaleshift_memory = - handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data()); - // finally create batch_norm backward primitive - auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive(); - - auto &astream = platform::MKLDNNDeviceContext::tls().get_stream(); - batch_norm_bwd_p->execute( - astream, - {{DNNL_ARG_SRC, *src_memory}, - {DNNL_ARG_MEAN, *mean_memory}, - {DNNL_ARG_VARIANCE, *variance_memory}, - {DNNL_ARG_DIFF_DST, *diff_dst_memory}, - {DNNL_ARG_SCALE_SHIFT, *scaleshift_memory}, - {DNNL_ARG_DIFF_SRC, *diff_src_memory}, - {DNNL_ARG_DIFF_SCALE_SHIFT, *diff_scaleshift_memory}}); - astream.wait(); - - T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); - T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); - - // copy back diff scale/shift to output tensors (diff scale/shift) - diff_scaleshift_data.resize(scaleshift_size); - auto it = std::begin(diff_scaleshift_data); - std::copy(it, std::next(it, C), diff_scale_data); - std::copy( - std::next(it, C), std::end(diff_scaleshift_data), diff_shift_data); - - // set memory descriptor of out tensor - diff_x->set_mem_desc(diff_src_memory->get_desc()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(batch_norm_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 97fe4d620cb9c..891cb40ab28df 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -27,7 +27,6 @@ register_unity_group( bilateral_slice_op.cc) register_unity_group( cc - mkldnn/batch_norm_mkldnn_op.cc bilinear_tensor_product_op.cc bmm_op.cc bpr_loss_op.cc diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index bc88fef443df2..f4577dab5aa47 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -47,7 +47,7 @@ bool constexpr is_int8() { template constexpr bool is_bfloat16() { - return std::is_same::value; + return std::is_same::value; } static void AppendActivation(const OneDNNContext& dev_ctx, @@ -102,7 +102,7 @@ static void AppendActivation(const 
OneDNNContext& dev_ctx, PADDLE_ENFORCE_NE( activation_type, activation_map.end(), - phi::errors::InvalidArgument( + errors::InvalidArgument( "Activation '%s' not found in oneDNN algorithms mapper", fuse_activation)); @@ -810,7 +810,7 @@ class SoftmaxOneDNNHandler PADDLE_ENFORCE_EQ( x->dims(), out->dims(), - phi::errors::InvalidArgument( + errors::InvalidArgument( "The shape of input and output tensor must be identical.")); const int canonical_axis = funcs::CanonicalAxis(axis, x->dims().size()); @@ -1145,7 +1145,7 @@ class PReluOneDNNHandler const bool is_test) : OneDNNHandlerNoCachingT( engine, cpu_place) { - auto weights_dims = phi::vectorize(weights.dims()); + auto weights_dims = vectorize(weights.dims()); // weights must have same size as X only for "element" case if (weights.dims().size() != x.dims().size()) { auto new_weights_dims = std::vector(x.dims().size(), 1); @@ -1304,21 +1304,52 @@ class BatchNormOneDNNHandler flags); } + BatchNormOneDNNHandler(const dnnl::engine engine, + Place cpu_place, + const float epsilon, + const DenseTensor* in_x, + const DenseTensor* scale, + const DenseTensor* out_grad) + : OneDNNHandlerNoCachingT(engine, + cpu_place) { + auto scale_tz = vectorize(scale->dims()); + PADDLE_ENFORCE_EQ( + scale_tz.size(), + 1, + errors::InvalidArgument( + "Dims of scale tensor must be 1, but received scale's size is %d", + scale_tz.size())); + + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_training, + in_x->mem_desc(), + epsilon, + dnnl::normalization_flags::use_scale_shift); + this->AcquireBackwardPrimitiveDescriptor( + dnnl::prop_kind::backward, + out_grad->mem_desc(), + in_x->mem_desc(), + epsilon, + dnnl::normalization_flags::use_scale_shift); + } + std::shared_ptr AcquireScaleShiftMemory( const DenseTensor* scale, const DenseTensor* shift) { - auto scale_tz = phi::vectorize(scale->dims()); + auto scale_tz = vectorize(scale->dims()); const unsigned int C = scale_tz[0]; PADDLE_ENFORCE_EQ( scale_tz.size(), 1, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Dims of scale tensor must be 1, but received scale's size is %d", scale_tz.size())); auto scaleshift_memory = this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc()); - // MKLDNN requires a single piece of memory for scale and shift/bias data + // oneDNN requires a single piece of memory for scale and shift/bias data auto mem_data_handle = reinterpret_cast(scaleshift_memory->get_data_handle()); std::copy(scale->data(), scale->data() + C, mem_data_handle); @@ -1692,7 +1723,7 @@ static std::vector GetInputStrides(const OneDNNContext& dev_ctx, auto& MatrixDimsFromVector = input_name == "X" ? 
RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; - phi::funcs::MatDescriptor mat_dim = phi::funcs::CreateMatrixDescriptor( + MatDescriptor mat_dim = CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0, transpose_input); std::vector strides; @@ -1728,8 +1759,7 @@ static bool IsOutputFused(const OneDNNContext& dev_ctx) { } template -class MatmulOneDNNHandler - : public phi::funcs::OneDNNHandlerNoCachingT { +class MatmulOneDNNHandler : public OneDNNHandlerNoCachingT { public: MatmulOneDNNHandler(const OneDNNContext& dev_ctx, const std::vector& x_org_dims, @@ -1739,8 +1769,8 @@ class MatmulOneDNNHandler const std::vector& x_strides_override, const std::vector& y_strides_override, bool is_output_fused) - : phi::funcs::OneDNNHandlerNoCachingT( - dev_ctx.GetEngine(), dev_ctx.GetPlace()) { + : OneDNNHandlerNoCachingT(dev_ctx.GetEngine(), + dev_ctx.GetPlace()) { // M X K * K X N std::vector x_dims(x_org_dims); std::vector y_dims(y_org_dims); diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index efd55dee88cd0..8d0ae7e08d70b 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -36,7 +36,6 @@ using ConstEigenVectorArrayMap = template void BatchNormGradRawKernel(const Context& ctx, - const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, diff --git a/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc b/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc new file mode 100644 index 0000000000000..503dd6416b46b --- /dev/null +++ b/paddle/phi/kernels/onednn/batch_norm_grad_kernel.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BatchNormGradRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + funcs::BatchNormOneDNNHandler handler( + dev_ctx.GetEngine(), dev_ctx.GetPlace(), epsilon, &x, &scale, &y_grad); + + const unsigned int C = vectorize(scale.dims())[0]; + const size_t scaleshift_size = 2 * C; + std::vector diff_scaleshift_data; + diff_scaleshift_data.reserve(scaleshift_size); + + auto src_memory = handler.AcquireSrcMemory(&x); + auto mean_memory = handler.AcquireMeanMemory(&saved_mean); + auto variance_memory = handler.AcquireVarianceMemory(&saved_variance); + auto diff_dst_memory = handler.AcquireDiffDstMemory(&y_grad); + auto scaleshift_memory = handler.AcquireScaleShiftMemory(&scale, &bias); + auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); + auto diff_scaleshift_memory = + handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data()); + + auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + batch_norm_bwd_p->execute( + astream, + {{DNNL_ARG_SRC, *src_memory}, + {DNNL_ARG_MEAN, *mean_memory}, + {DNNL_ARG_VARIANCE, *variance_memory}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory}, + {DNNL_ARG_SCALE_SHIFT, *scaleshift_memory}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory}, + {DNNL_ARG_DIFF_SCALE_SHIFT, *diff_scaleshift_memory}}); + astream.wait(); + + T* diff_scale_data = dev_ctx.template Alloc(scale_grad); + T* diff_shift_data = dev_ctx.template Alloc(bias_grad); + + // copy back diff scale/shift to output tensors (diff scale/shift) + diff_scaleshift_data.resize(scaleshift_size); + auto it = std::begin(diff_scaleshift_data); + std::copy(it, std::next(it, C), diff_scale_data); + std::copy(std::next(it, C), std::end(diff_scaleshift_data), diff_shift_data); + + // set memory descriptor of out tensor + x_grad->set_mem_desc(diff_src_memory->get_desc()); +} + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const DenseTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + BatchNormGradRawKernel(dev_ctx, + x, + scale, + bias, + mean, + variance, + saved_mean, + saved_variance, + reserve_space, + y_grad, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + /*is_inplace*/ false, + x_grad, + scale_grad, + bias_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm_grad, OneDNN, ONEDNN, phi::BatchNormGradKernel, float) {} +PD_REGISTER_KERNEL( + batch_norm_grad_raw, 
OneDNN, ONEDNN, phi::BatchNormGradRawKernel, float) {} From 4f975b418dd46c33db91ab128f1b0f9ebe7b85e1 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Thu, 24 Nov 2022 10:49:15 +0800 Subject: [PATCH 190/210] [Zero-Dim] Support input 0D Tensor for some api (#48007) --- .../tests/unittests/test_zero_dim_tensor.py | 251 +++++++++++++---- .../unittests/xpu/test_zero_dim_tensor_xpu.py | 258 ++++++++++++++++-- 2 files changed, 436 insertions(+), 73 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py index 174172b026f21..dcfee03f40cfa 100644 --- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py @@ -14,16 +14,20 @@ import paddle import paddle.fluid as fluid +from decorator_helper import prog_scope import paddle.nn.functional as F import numpy as np import unittest +fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) unary_api_list = [ paddle.nn.functional.elu, paddle.nn.functional.gelu, paddle.nn.functional.hardsigmoid, paddle.nn.functional.hardswish, + paddle.nn.functional.hardshrink, + paddle.nn.functional.hardtanh, paddle.nn.functional.leaky_relu, paddle.nn.functional.log_sigmoid, paddle.nn.functional.relu, @@ -37,9 +41,11 @@ paddle.nn.functional.thresholded_relu, paddle.stanh, paddle.nn.functional.celu, + paddle.nn.functional.selu, paddle.nn.functional.mish, paddle.nn.functional.silu, paddle.nn.functional.tanh, + paddle.nn.functional.dropout, paddle.cosh, paddle.sinh, paddle.abs, @@ -65,6 +71,24 @@ paddle.log10, paddle.log2, paddle.tan, + paddle.erf, + paddle.erfinv, + paddle.rsqrt, + paddle.sign, + paddle.deg2rad, + paddle.rad2deg, + paddle.neg, + paddle.logit, + paddle.trunc, + paddle.digamma, + paddle.lgamma, + paddle.poisson, + paddle.bernoulli, +] + +inplace_api_list = [ + paddle.nn.functional.relu_, + paddle.nn.functional.tanh_, ] @@ -72,7 +96,6 @@ class TestUnaryAPI(unittest.TestCase): def test_dygraph_unary(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for api in unary_api_list: x = paddle.rand([]) x.stop_gradient = False @@ -81,8 +104,15 @@ def test_dygraph_unary(self): self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - self.assertEqual(x.grad.shape, []) - self.assertEqual(out.grad.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + for api in inplace_api_list: + x = paddle.rand([]) + out = api(x) + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) paddle.enable_static() @@ -95,28 +125,32 @@ def test_static_unary(self): x = paddle.rand([]) x.stop_gradient = False out = api(x) - fluid.backward.append_backward(out) + paddle.static.append_backward(out) - # ScaleLossGradOp / append_backward always set grad shape to [1] - prog = paddle.static.default_main_program() - block = prog.global_block() - - x_grad = block.var(fluid.framework.grad_var_name(x.name)) - out_grad = block.var(fluid.framework.grad_var_name(out.name)) - - # Test compile shape, grad is always [1] + # Test compile shape self.assertEqual(x.shape, ()) self.assertEqual(out.shape, ()) - exe = fluid.Executor() - result = exe.run( - main_prog, fetch_list=[x, out, x_grad, out_grad] - ) + fetch_list = [x, out] + # TODO(zhouwei): ScaleLossGradOp / append_backward set grad shape to [1] + # will change to [] after kernel is fixed + prog = paddle.static.default_main_program() + block = prog.global_block() + 
if block.has_var(fluid.framework.grad_var_name(x.name)): + out_grad = block.var( + fluid.framework.grad_var_name(out.name) + ) + fetch_list.append(out_grad) + self.assertEqual(out_grad.shape, ()) # Test runtime shape + exe = fluid.Executor() + result = exe.run(main_prog, fetch_list=fetch_list) self.assertEqual(result[0].shape, ()) self.assertEqual(result[1].shape, ()) - self.assertEqual(result[3].shape, (1,)) + if len(result) == 3: + # TODO(zhouwei): will change to [] after kernel is fixed + self.assertEqual(result[2].shape, (1,)) # 0D will be stacked when 1+ place, due to it cannot be concated # for 1 place: [ x-place1 ] @@ -135,28 +169,30 @@ def test_static_unary(self): ).with_data_parallel(out.name, places=places) result = exe.run( compiled_program, - fetch_list=[x, out, x_grad, out_grad], + fetch_list=fetch_list, return_merged=True, ) # Test runtime parallel shape self.assertEqual(result[0].shape, expect_shape) self.assertEqual(result[1].shape, expect_shape) - self.assertEqual(result[3].shape, (device_num,)) + if len(result) == 3: + self.assertEqual(result[2].shape, (device_num,)) compiled_program = fluid.CompiledProgram( main_prog ).with_data_parallel(out.name, places=places) result = exe.run( compiled_program, - fetch_list=[x, out, x_grad, out_grad], + fetch_list=fetch_list, return_merged=False, ) # [[x-place1, x-place2, ...], [], [], ...] self.assertEqual(np.array(result[0]).shape, (device_num,)) self.assertEqual(np.array(result[1]).shape, (device_num,)) - self.assertEqual(np.array(result[3]).shape, (device_num, 1)) + if len(result) == 3: + self.assertEqual(np.array(result[2]).shape, (device_num, 1)) paddle.disable_static() @@ -181,7 +217,6 @@ def test_static_unary(self): class TestReduceAPI(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for api in reduce_api_list: if api in [paddle.all, paddle.any]: x = paddle.randint(0, 2, []).astype('bool') @@ -234,9 +269,6 @@ def test_static(self): {'func': paddle.multiply, 'cls_method': '__mul__'}, {'func': paddle.divide, 'cls_method': '__div__'}, {'func': paddle.pow, 'cls_method': '__pow__'}, -] - -binary_api_list_without_grad = [ {'func': paddle.equal, 'cls_method': '__eq__'}, {'func': paddle.not_equal, 'cls_method': '__ne__'}, {'func': paddle.greater_equal, 'cls_method': '__ge__'}, @@ -251,7 +283,7 @@ def test_static(self): paddle.logical_xor, ] -binary_int_api_list_without_grad = [ +binary_int_api_list = [ paddle.bitwise_and, paddle.bitwise_or, paddle.bitwise_xor, @@ -262,8 +294,7 @@ def test_static(self): class TestBinaryAPI(unittest.TestCase): def test_dygraph_binary(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - for api in binary_api_list + binary_api_list_without_grad: + for api in binary_api_list: # 1) x/y is 0D x = paddle.rand([]) y = paddle.rand([]) @@ -275,10 +306,10 @@ def test_dygraph_binary(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, []) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(y.grad.shape, []) self.assertEqual(out.grad.shape, []) @@ -294,10 +325,10 @@ def test_dygraph_binary(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, [2, 3, 4]) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: 
self.assertEqual(x.grad.shape, [2, 3, 4]) self.assertEqual(y.grad.shape, []) self.assertEqual(out.grad.shape, [2, 3, 4]) @@ -313,10 +344,10 @@ def test_dygraph_binary(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, [2, 3, 4]) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(y.grad.shape, [2, 3, 4]) self.assertEqual(out.grad.shape, [2, 3, 4]) @@ -329,7 +360,7 @@ def test_dygraph_binary(self): out = getattr(paddle.Tensor, api['cls_method'])(x, y) self.assertEqual(out.shape, []) - for api in binary_int_api_list_without_grad: + for api in binary_int_api_list: # 1) x/y is 0D x = paddle.randint(-10, 10, []) y = paddle.randint(-10, 10, []) @@ -352,7 +383,7 @@ def test_dygraph_binary(self): def test_static_binary(self): paddle.enable_static() - for api in binary_api_list + binary_api_list_without_grad: + for api in binary_api_list: main_prog = fluid.Program() with fluid.program_guard(main_prog, fluid.Program()): # 1) x/y is 0D @@ -368,16 +399,15 @@ def test_static_binary(self): self.assertEqual(out.shape, out_cls.shape) else: out = api(x, y) - fluid.backward.append_backward(out) + paddle.static.append_backward(out) - # Test compile shape self.assertEqual(out.shape, ()) + exe = fluid.Executor() - out_np = exe.run(main_prog, fetch_list=[out])[0] - # Test runtime shape - self.assertEqual(out_np.shape, ()) + result = exe.run(main_prog, fetch_list=[out]) + self.assertEqual(result[0].shape, ()) - # TODO(zhouwei): will open when create_scalar is [] + # TODO: will open when create_scalar is [] # 2) x is 0D , y is scalar ''' x = paddle.rand([]) @@ -391,7 +421,7 @@ def test_static_binary(self): self.assertEqual(out.shape, ()) ''' - for api in binary_int_api_list_without_grad: + for api in binary_int_api_list: main_prog = fluid.Program() with fluid.program_guard(main_prog, fluid.Program()): # 1) x/y is 0D @@ -415,10 +445,11 @@ def test_static_binary(self): paddle.disable_static() -# Use to test zero-dim of Sundry API, which is simple and do -# not have backward, or is not need to test backward in OpTest. +# Use to test zero-dim of Sundry API, which is unique and can not be classified +# with others. It can be implemented here flexibly. 
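The binary cases above all assert one contract: a 0-D tensor (shape []) stays 0-D through elementwise ops, broadcasting it against an N-D operand yields the N-D shape, and each gradient keeps the shape of its own operand. A minimal standalone sketch of that contract, assuming a Paddle build that already contains the 0-D support added in this patch:

    import paddle

    x = paddle.rand([])          # 0-D tensor: x.shape == []
    y = paddle.rand([2, 3, 4])   # ordinary 3-D tensor
    x.stop_gradient = False
    y.stop_gradient = False

    out = paddle.add(x, y)       # 0-D operand broadcasts against 3-D
    out.backward()

    assert out.shape == [2, 3, 4]     # result takes the larger shape
    assert x.grad.shape == []         # grad of the 0-D operand stays 0-D
    assert y.grad.shape == [2, 3, 4]  # grad of the 3-D operand keeps its shape
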
class TestSundryAPI(unittest.TestCase): def setUp(self): + paddle.disable_static() self.x = paddle.rand([]) def test_linear(self): @@ -501,6 +532,130 @@ def test_shape(self): self.assertEqual(out.shape, [0]) np.testing.assert_array_equal(out.numpy(), np.array([])) + def test_pow_factor(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.pow(x, 2.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_cast(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cast(x, 'int32') + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_clip(self): + x = paddle.uniform([], None, -10, 10) + x.stop_gradient = False + out = paddle.clip(x, -5, 5) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_increment(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.increment(x, 1.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_bitwise_not(self): + x = paddle.randint(-1, 1, []) + out1 = ~x + out2 = paddle.bitwise_not(x) + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + + def test_logical_not(self): + x = paddle.randint(0, 1, []) + out = paddle.logical_not(x) + + self.assertEqual(out.shape, []) + + +class TestSundryAPIStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.exe = paddle.static.Executor() + + @prog_scope() + def test_pow_factor(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.pow(x, 2.0) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_cast(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cast(x, 'int32') + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_clip(self): + x = paddle.uniform([], None, -10, 10) + x.stop_gradient = False + out = paddle.clip(x, -5, 5) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_increment(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.increment(x, 1.0) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_bitwise_not(self): + x = paddle.randint(-1, 1, []) + out = paddle.bitwise_not(x) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_logical_not(self): + x = paddle.randint(0, 1, []) + out = paddle.logical_not(x) + paddle.static.append_backward(out) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + # Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
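TestSundryAPIStatic above repeats each case under the static graph: build the program, call paddle.static.append_backward on the 0-D output, run it with an Executor, and check that the fetched result is a 0-D numpy array. A condensed sketch of that pattern, assuming a Paddle build with the 0-D kernels from this patch (the @prog_scope decorator in the tests only wraps each case in a fresh program, which the sketch does explicitly):

    import paddle

    paddle.enable_static()
    main_prog = paddle.static.Program()
    with paddle.static.program_guard(main_prog, paddle.static.Program()):
        x = paddle.rand([])                 # compile-time shape ()
        x.stop_gradient = False
        out = paddle.pow(x, 2.0)
        paddle.static.append_backward(out)  # add the grad ops

    exe = paddle.static.Executor()
    (res,) = exe.run(main_prog, fetch_list=[out])
    assert res.shape == ()                  # runtime result is a 0-D ndarray

As the TODO comments in the tests note, gradients produced through append_backward still come back with shape (1,) until the scale-loss-grad kernel is fixed, which is why this sketch only fetches the forward output.
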
class TestNoBackwardAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py index 5868fe9cb531b..a0925207c957e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py @@ -20,12 +20,15 @@ paddle.set_device('xpu') +fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) unary_api_list = [ paddle.nn.functional.elu, paddle.nn.functional.gelu, paddle.nn.functional.hardsigmoid, paddle.nn.functional.hardswish, + paddle.nn.functional.hardshrink, + paddle.nn.functional.hardtanh, paddle.nn.functional.leaky_relu, paddle.nn.functional.log_sigmoid, paddle.nn.functional.relu, @@ -39,9 +42,11 @@ paddle.nn.functional.thresholded_relu, paddle.stanh, paddle.nn.functional.celu, + paddle.nn.functional.selu, paddle.nn.functional.mish, paddle.nn.functional.silu, paddle.nn.functional.tanh, + paddle.nn.functional.dropout, paddle.cosh, paddle.sinh, paddle.abs, @@ -67,14 +72,31 @@ paddle.log10, paddle.log2, paddle.tan, + paddle.erf, + paddle.erfinv, + paddle.rsqrt, + paddle.sign, + paddle.deg2rad, + paddle.rad2deg, + paddle.neg, + paddle.logit, + paddle.trunc, + paddle.digamma, + paddle.lgamma, + paddle.poisson, + paddle.bernoulli, +] + +inplace_api_list = [ + paddle.nn.functional.relu_, + paddle.nn.functional.tanh_, ] # Use to test zero-dim in unary API. class TestUnaryAPI(unittest.TestCase): - def test(self): + def test_dygraph_unary(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for api in unary_api_list: x = paddle.rand([]) x.stop_gradient = False @@ -83,8 +105,15 @@ def test(self): self.assertEqual(x.shape, []) self.assertEqual(out.shape, []) - self.assertEqual(x.grad.shape, []) - self.assertEqual(out.grad.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + for api in inplace_api_list: + x = paddle.rand([]) + out = api(x) + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) paddle.enable_static() @@ -107,9 +136,8 @@ def test(self): # Use to test zero-dim of reduce API class TestReduceAPI(unittest.TestCase): - def test(self): + def test_dygraph(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for api in reduce_api_list: if api in [paddle.all, paddle.any]: x = paddle.randint(0, 2, []).astype('bool') @@ -136,9 +164,6 @@ def test(self): {'func': paddle.multiply, 'cls_method': '__mul__'}, {'func': paddle.divide, 'cls_method': '__div__'}, {'func': paddle.pow, 'cls_method': '__pow__'}, -] - -binary_api_list_without_grad = [ {'func': paddle.equal, 'cls_method': '__eq__'}, {'func': paddle.not_equal, 'cls_method': '__ne__'}, {'func': paddle.greater_equal, 'cls_method': '__ge__'}, @@ -153,7 +178,7 @@ def test(self): paddle.logical_xor, ] -binary_int_api_list_without_grad = [ +binary_int_api_list = [ paddle.bitwise_and, paddle.bitwise_or, paddle.bitwise_xor, @@ -162,10 +187,9 @@ def test(self): # Use to test zero-dim of binary API class TestBinaryAPI(unittest.TestCase): - def test(self): + def test_dygraph_binary(self): paddle.disable_static() - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - for api in binary_api_list + binary_api_list_without_grad: + for api in binary_api_list: # 1) x/y is 0D x = paddle.rand([]) y = paddle.rand([]) @@ -177,10 +201,10 @@ def test(self): np.testing.assert_array_equal(out_cls.numpy(), 
out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, []) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(y.grad.shape, []) self.assertEqual(out.grad.shape, []) @@ -196,10 +220,10 @@ def test(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, [2, 3, 4]) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: self.assertEqual(x.grad.shape, [2, 3, 4]) self.assertEqual(y.grad.shape, []) self.assertEqual(out.grad.shape, [2, 3, 4]) @@ -215,10 +239,10 @@ def test(self): np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) else: out = api(x, y) - self.assertEqual(out.shape, [2, 3, 4]) - if api not in binary_api_list_without_grad: - out.backward() + + out.backward() + if x.grad is not None: self.assertEqual(x.grad.shape, []) self.assertEqual(y.grad.shape, [2, 3, 4]) self.assertEqual(out.grad.shape, [2, 3, 4]) @@ -231,7 +255,7 @@ def test(self): out = getattr(paddle.Tensor, api['cls_method'])(x, y) self.assertEqual(out.shape, []) - for api in binary_int_api_list_without_grad: + for api in binary_int_api_list: # 1) x/y is 0D x = paddle.randint(-10, 10, []) y = paddle.randint(-10, 10, []) @@ -253,8 +277,8 @@ def test(self): paddle.enable_static() -# Use to test zero-dim of Sundry API, which is simple and do -# not have backward, or is not need to test backward in OpTest. +# Use to test zero-dim of Sundry API, which is unique and can not be classified +# with others. It can be implemented here flexibly. class TestSundryAPI(unittest.TestCase): def setUp(self): paddle.disable_static() @@ -336,6 +360,190 @@ def test_shape(self): self.assertEqual(out.shape, [0]) np.testing.assert_array_equal(out.numpy(), np.array([])) + def test_pow_factor(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.pow(x, 2.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_cast(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cast(x, 'int32') + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_clip(self): + x = paddle.uniform([], None, -10, 10) + x.stop_gradient = False + out = paddle.clip(x, -5, 5) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_increment(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.increment(x, 1.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_bitwise_not(self): + x = paddle.randint(-1, 1, []) + out1 = ~x + out2 = paddle.bitwise_not(x) + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + + def test_logical_not(self): + x = paddle.randint(0, 1, []) + out = paddle.logical_not(x) + + self.assertEqual(out.shape, []) + + +# Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
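The TestNoBackwardAPI class that follows covers a different contract: creation and slicing APIs now accept 0-D integer tensors inside their shape-like arguments, and an empty shape list produces a 0-D result. A small sketch of both behaviours, assuming the 0-D kernels from this patch are available on the chosen device (XPU here, but the same holds on CPU/GPU):

    import paddle

    # an empty shape list produces a 0-D tensor
    scalar = paddle.full([], 0.5)
    assert scalar.shape == []

    # each shape entry may itself be a 0-D int32 tensor
    shape = [paddle.full([], 2, 'int32'), paddle.full([], 3, 'int32')]
    assert paddle.ones(shape).shape == [2, 3]

    # 0-D tensors also work as slice bounds
    x = paddle.rand([5, 3, 3])
    starts = [paddle.full([], 1, 'int32')]
    ends = [paddle.full([], 3, 'int32')]
    assert paddle.slice(x, axes=[1], starts=starts, ends=ends).shape == [5, 2, 3]
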
+class TestNoBackwardAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + + def test_slice(self): + starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')] + ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')] + x = paddle.rand([5, 3, 3]) + out = paddle.slice(x, [1, 2], starts, ends) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_strided_slice(self): + starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')] + ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')] + strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')] + x = paddle.rand([5, 5, 5]) + out = paddle.strided_slice(x, [1, 2], starts, ends, strides) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_linspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 5.0) + num = paddle.full([], 5, 'int32') + out = paddle.linspace(start, stop, num) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_arange(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 6.0) + step = paddle.full([], 1.0) + out = paddle.arange(start, stop, step) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_normal(self): + mean = paddle.full([], 0.0) + std = paddle.full([], 0.0) + out = paddle.normal(mean, std) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, []) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_rand(self): + out = paddle.rand([]) + self.assertEqual(out.shape, []) + + out = paddle.rand(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randn(self): + out = paddle.randn([]) + self.assertEqual(out.shape, []) + + out = paddle.randn(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randint_and_randint_like(self): + out = paddle.randint(-10, 10, []) + self.assertEqual(out.shape, []) + + out = paddle.randint_like(out, -10, 10) + self.assertEqual(out.shape, []) + + out = paddle.randint(-10, 10, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_standard_normal(self): + out = paddle.standard_normal([]) + self.assertEqual(out.shape, []) + + out = paddle.standard_normal(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_uniform(self): + out = paddle.uniform([]) + self.assertEqual(out.shape, []) + + out = paddle.uniform(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_empty_and_empty_like(self): + out = paddle.empty([]) + self.assertEqual(out.shape, []) + + out = paddle.empty_like(out) + self.assertEqual(out.shape, []) + + out = paddle.empty(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_full_and_full_like(self): + out = paddle.full([], 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full_like(out, 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full(self.shape, 0.5) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_ones_and_ones_like(self): + out = paddle.ones([]) + self.assertEqual(out.shape, []) + + out = paddle.ones_like(out) + self.assertEqual(out.shape, []) + + out = paddle.ones(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_zeros_and_zeros_like(self): + out = paddle.zeros([]) + self.assertEqual(out.shape, []) + + out = paddle.zeros_like(out) + self.assertEqual(out.shape, []) + + out = paddle.zeros(self.shape) + 
self.assertEqual(out.shape, [2, 3, 4]) + if __name__ == "__main__": unittest.main() From 6bdf126131d5bdde6c553c1585cfef8ff8d0826b Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Thu, 24 Nov 2022 11:06:27 +0800 Subject: [PATCH 191/210] [AutoParallel] dist_scale (#48295) --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_scale.py | 88 +++++++++++++++++++ .../auto_parallel/test_dist_scale.py | 74 ++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_scale.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_dist_scale.py diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 4a0a05a4f1cd4..406ec4d8b36da 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -35,3 +35,4 @@ from . import dist_reduce_sum_p from . import dist_shape from . import dist_assign +from . import dist_scale diff --git a/python/paddle/distributed/auto_parallel/operators/dist_scale.py b/python/paddle/distributed/auto_parallel/operators/dist_scale.py new file mode 100644 index 0000000000000..e419dd6c82428 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_scale.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .dist_default import DistributedDefaultImpl0 +from ..utils import compute_compatible_and_update_dim_mapping + + +class DistributedScale(DistributedOperatorImplContainer): + def __init__(self, op_type): + super().__init__(op_type) + + +register_distributed_operator_impl_container(DistributedScale("scale")) + + +class DistributedScaleImpl(DistributedOperatorImpl): + def __init__(self, name): + super().__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or ( + not self.is_output_compatible(dist_op) + ): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i] + ) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("scale", DistributedScaleImpl("scale")) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_scale.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_scale.py new file mode 100644 index 0000000000000..2d106f6296ca9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_scale.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
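update_dims_mapping above leans on compute_compatible_and_update_dim_mapping to force scale's input and output onto one sharding: per axis, -1 means replicated and a non-negative value names a mesh dimension, and the more specific value wins. A pure-Python illustration of that per-axis merge rule — merge_axis is a simplified stand-in written for this note, not the real helper, which updates the op's dist_attr in place:

    def merge_axis(a, b):
        """Pick the compatible dims_mapping value for one axis, or None on conflict."""
        if a == b:
            return a
        if a == -1:   # a is replicated, so b decides
            return b
        if b == -1:   # b is replicated, so a decides
            return a
        return None   # sharded on two different mesh dims: incompatible

    x_mapping = [-1, 0, -1]     # e.g. what [None, "x", None] annotates on a rank-3 input
    out_mapping = [-1, -1, -1]  # output starts fully replicated
    merged = [merge_axis(a, b) for a, b in zip(x_mapping, out_mapping)]
    assert merged == [-1, 0, -1]  # scale's output inherits the input's sharding

The test program below asserts the end result of that rule: after completion and partitioning, the scale op's input and output dims_mapping coincide.
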
+ +import unittest +import paddle +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +def make_program(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + x.stop_gradient = False + auto.shard_tensor( + x, auto.ProcessMesh([0, 1], dim_names=["x"]), [None, "x", None] + ) + res = paddle.scale(x, scale=2.0, bias=1.0) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition( + main_program, start_program, [] + ) + + return dist_main_prog, dist_context + + +class TestDistScale(unittest.TestCase): + def test_dist_scale(self): + + dist_main_prog, dist_context = parallelizer(make_program, 0) + ops = dist_main_prog.global_block().ops + scale_op = ops[0] + dist_op = dist_context.get_dist_op_for_program(scale_op) + dist_op.dist_attr.impl_type == "scale" + dist_op.dist_attr.impl_idx == 0 + + in_name = scale_op.input_arg_names[0] + out_name = scale_op.output_arg_names[0] + in_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(in_name) + out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(out_name) + + assert in_dims_mapping == out_dims_mapping + + +if __name__ == "__main__": + unittest.main() From 1623f1b4f3b23cac46c0b4e8ceacfda5697d9ec0 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 24 Nov 2022 12:03:10 +0800 Subject: [PATCH 192/210] [Phi Support CuDNN] Support ALL CuDNN (#47865) * support default use_gpudnn=True * fully support cudnn in phi * add header file * add white_list, verify accuracy * phi support all cudnn * opt affine_grad * try different arches of pretrained_model * try different arches of pretrained_model * add debug string * debug eager_method * add debug string, pass all local ctest * polish all debug code * delete use_cudnn relevant code autogen * fix depthwise_conv2d * Share all other members of Tensor except use_cudnn * polish codes according to review opinion * polish codes according to review opinion, fix bug * polish codes according to review opinion, opt performance * polish codes according to review opinion, fix pooling.py --- paddle/fluid/pybind/eager_method.cc | 42 +++++++++++++++++++ paddle/phi/api/lib/kernel_dispatch.cc | 21 ++++++---- paddle/phi/api/lib/kernel_dispatch.h | 13 +++++- paddle/phi/api/yaml/generator/api_base.py | 12 +----- paddle/phi/api/yaml/legacy_backward.yaml | 31 +++++--------- paddle/phi/api/yaml/legacy_ops.yaml | 15 ++----- paddle/phi/core/dense_tensor.cc | 3 +- paddle/phi/core/dense_tensor_impl.cc | 1 + paddle/phi/core/kernel_factory.cc | 11 +++-- paddle/phi/core/kernel_factory.h | 3 +- paddle/phi/core/tensor_meta.cc | 14 +++++-- paddle/phi/core/tensor_meta.h | 12 ++++-- python/paddle/fluid/dygraph/nn.py | 2 +- .../fluid/dygraph/varbase_patch_methods.py | 5 +++ python/paddle/fluid/layers/nn.py | 2 +- .../tests/unittests/test_egr_python_api.py | 15 +++++++ 
python/paddle/nn/functional/pooling.py | 12 ++---- python/paddle/nn/functional/vision.py | 3 +- 18 files changed, 134 insertions(+), 83 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a3e7f43faedaf..0610a51d4cc22 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -1444,6 +1445,43 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__use_cudnn(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.defined() && self->tensor.is_dense_tensor(), + paddle::platform::errors::Fatal( + "function _use_cudnn is only effective for DenseTensor")); + + bool use_cudnn = pybind::CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); + + // Set the same use_cudnn attribute, return directly + phi::DenseTensor* dense_tensor = + static_cast(self->tensor.impl().get()); + phi::DenseTensorMeta* dense_tensor_meta = + phi::DenseTensorUtils::GetMutableMeta(dense_tensor); + if (use_cudnn == dense_tensor_meta->use_cudnn) { + return ToPyObject(self->tensor); + } + + // Share all other members of Tensor except use_cudnn + phi::DenseTensorMeta target_dense_meta = *dense_tensor_meta; + target_dense_meta.use_cudnn = use_cudnn; + phi::DenseTensor target_dense_tensor; + target_dense_tensor.ShareDataWith(*dense_tensor); + target_dense_tensor.set_meta(target_dense_meta); + // Construct returned tensor + paddle::experimental::Tensor target_tensor( + std::make_shared(target_dense_tensor), + self->tensor.name()); + target_tensor.set_autograd_meta(self->tensor.mutable_autograd_meta()); + VLOG(4) << "Tensor: " << target_tensor.name() + << " set use_cudnn = " << use_cudnn; + + return ToPyObject(target_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_set_vocab(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -2010,6 +2048,10 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor__copy_gradient_from, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_tensor_use_cudnn", + (PyCFunction)(void (*)(void))tensor__use_cudnn, + METH_VARARGS | METH_KEYWORDS, + NULL}, /** the methods to adapt old dygraph, will be removed in the future **/ {"set_string_list", (PyCFunction)(void (*)(void))tensor_method_set_string_list, diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index ccf070c7249fd..941bc880b99f0 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -54,14 +54,11 @@ bool HasAllocation(const phi::TensorBase& t) { BackendSet GetTensorBackendSet(const phi::TensorBase& t) { if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) { - BackendSet backend_set(phi::TransToPhiBackend(t.place())); - switch (t.layout()) { - case DataLayout::ONEDNN: - backend_set = backend_set | BackendSet(Backend::ONEDNN); - break; - default: - // do nothing - break; + phi::Backend backend_key = phi::TransToPhiBackend(t.place()); + BackendSet backend_set(backend_key); + if (backend_key == Backend::GPU && phi::DenseTensor::classof(&t) && + static_cast(t).meta().use_cudnn) { + backend_set = backend_set | 
BackendSet(Backend::GPUDNN); } return backend_set; } @@ -126,7 +123,13 @@ Backend ParseBackend(const Place& place) { return phi::TransToPhiBackend(place); } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPhiBackend(tensor.place()); + Backend backend_key = phi::TransToPhiBackend(tensor.place()); + if (backend_key == Backend::GPU && + phi::DenseTensor::classof(tensor.impl().get()) && + static_cast(tensor.impl().get())->meta().use_cudnn) { + return Backend::GPUDNN; + } + return backend_key; } Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 176713b71bbcf..bfe8eba2444b6 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -90,6 +90,7 @@ struct ArgsIterator { struct KernelKeyParser : ArgsIterator { KernelKeySet key_set; + bool disable_cudnn = false; // this dtype_set is used for cache multi-inputs dtype and used for // data_promote DataTypeSet dtype_set{DataType::UNDEFINED}; @@ -97,11 +98,19 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend inline void AssignKernelKeySet(const phi::TensorBase& tensor) { - key_set.backend_set = - key_set.backend_set | detail::GetTensorBackendSet(tensor); + // assign Backend + BackendSet tensor_backend_set = detail::GetTensorBackendSet(tensor); + key_set.backend_set = key_set.backend_set | tensor_backend_set; + // tensor's attribute use_cudnn=False, explicitly disable cudnn kernel + if (tensor_backend_set == BackendSet(Backend::GPU) || disable_cudnn) { + disable_cudnn = true; + key_set.backend_set = key_set.backend_set - BackendSet(Backend::GPUDNN); + } + // assign DataLayout phi::DataLayout tensor_layout = tensor.layout(); key_set.layout = tensor_layout > key_set.layout ? 
tensor_layout : key_set.layout; + // assign DataType key_set.dtype = tensor.dtype(); dtype_set = dtype_set | DataTypeSet(key_set.dtype); auto promote_result = PromoteTypes(dtype_set); diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index 696ad8736b90e..3ad68e6d1d0bd 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -307,7 +307,6 @@ def parse_kernel(self, kernel_config): 'backend': None, 'layout': None, 'data_type': None, - 'use_gpudnn': 'false', 'dispatch': {}, } if 'backend' in kernel_config and len(kernel_config['backend']) > 0: @@ -318,10 +317,6 @@ def parse_kernel(self, kernel_config): kernel['data_type'] = kernel_config['data_type'] if 'param' in kernel_config: kernel['param'] = kernel_config['param'] - if 'use_gpudnn' in kernel_config: - kernel['use_gpudnn'] = kernel_config['use_gpudnn'] - if isinstance(kernel['use_gpudnn'], bool): - kernel['use_gpudnn'] = str(kernel['use_gpudnn']).lower() kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall( kernel_config['func'] ) @@ -1124,15 +1119,10 @@ def gen_kernel_code(self, kernel_name, code_indent, inplace_flag=False): for kernel_out in outputs_args: fallback_kernel_output_trans += f""" {code_indent} TransDataBackend({kernel_out}, kernel_backend, {kernel_out});""" - cudnn_args = ( - '' - if self.kernel['use_gpudnn'] == 'false' - else ', ' + self.kernel['use_gpudnn'] - ) return f""" {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( -{code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}{cudnn_args}); +{code_indent} "{kernel_name}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} const auto& kernel = kernel_result.kernel; {code_indent} VLOG(6) << "{kernel_name} kernel: " << kernel; {code_indent} auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? 
Backend::CPU : kernel_backend); diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index a61aa52cc821f..dc542a9964f53 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -67,8 +67,8 @@ func : addmm_grad - backward_op : affine_grid_grad - forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) -> Tensor(output) - args : (Tensor output_grad, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) + forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true) -> Tensor(output) + args : (Tensor input, Tensor output_grad, IntArray outputShape, bool align_corners=true) output : Tensor(input_grad) infer_meta : func : AffineGridGradInferMeta @@ -76,7 +76,7 @@ kernel : func : affine_grid_grad param : [output_grad, outputShape, align_corners] - use_gpudnn: use_cudnn + no_need_buffer : input - backward_op : amax_grad forward: amax (Tensor x, int64_t[] axis={}, bool keepdim=false) -> Tensor(out) @@ -262,7 +262,6 @@ param : [input, filter] kernel : func : conv2d_grad - use_gpudnn : true backward : conv2d_grad_grad - backward_op : conv2d_grad_grad @@ -274,7 +273,6 @@ param: [input, filter, grad_out] kernel : func : conv2d_grad_grad - use_gpudnn : true optional : grad_input_grad, grad_filter_grad - backward_op : conv2d_transpose_double_grad @@ -285,7 +283,6 @@ func : Conv2dTransposeDoubleGradInferMeta kernel : func : conv2d_transpose_grad_grad - use_gpudnn : true - backward_op : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) @@ -295,7 +292,6 @@ func : Conv2dTransposeGradInferMeta kernel : func : conv2d_transpose_grad - use_gpudnn : true backward : conv2d_transpose_double_grad - backward_op : conv3d_double_grad @@ -307,7 +303,6 @@ param: [input, filter, grad_out] kernel : func : conv3d_double_grad - use_gpudnn : true optional : grad_input_grad, grad_filter_grad - backward_op : conv3d_grad @@ -319,7 +314,6 @@ param : [input, filter] kernel : func : conv3d_grad - use_gpudnn : true backward : conv3d_double_grad - backward_op : conv3d_transpose_grad @@ -330,7 +324,6 @@ func : ConvTransposeGradInferMeta kernel : func : conv3d_transpose_grad - use_gpudnn : true - backward_op : crop_grad forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) @@ -401,7 +394,6 @@ kernel : func : depthwise_conv2d_grad param : [input, filter, out_grad, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : True backward : depthwise_conv2d_double_grad - backward_op : depthwise_conv2d_transpose_grad @@ -1210,8 +1202,8 @@ func : pixel_shuffle_grad - backward_op : pool2d_double_grad - forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(grad_x) - args : (Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool 
exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(grad_out_grad) infer_meta : func : Pool2DInferMeta @@ -1219,11 +1211,11 @@ kernel : func : pool2d_double_grad param : [grad_x_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn + no_need_buffer : x - backward_op : pool2d_grad - forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1231,12 +1223,11 @@ kernel : func : pool2d_grad param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool2d_double_grad - backward_op : pool3d_grad - forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) - args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) + args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1244,7 +1235,6 @@ kernel : func : pool3d_grad param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn - backward_op : pow_double_grad forward : pow_grad(Tensor x, Tensor grad_out, Scalar y) -> Tensor(grad_x) @@ -1601,7 +1591,6 @@ param : [out] kernel : func : softmax_grad - use_gpudnn : true - backward_op : spectral_norm_grad forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out) diff --git 
a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 7fb2c2441055e..5f7bc550083c0 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -97,7 +97,7 @@ backward : addmm_grad - op : affine_grid - args : (Tensor input, IntArray outputShape, bool align_corners=true, bool use_cudnn=true) + args : (Tensor input, IntArray outputShape, bool align_corners=true) output : Tensor infer_meta : func : AffineGridInferMeta @@ -106,7 +106,6 @@ func : affine_grid param : [input, outputShape, align_corners] data_type : input - use_gpudnn: use_cudnn backward : affine_grid_grad - op : all @@ -431,7 +430,6 @@ func : ConvInferMeta kernel : func : conv2d - use_gpudnn : true backward : conv2d_grad - op : conv2d_transpose @@ -441,7 +439,6 @@ func : Conv2dTransposeInferMeta kernel : func : conv2d_transpose - use_gpudnn : true backward : conv2d_transpose_grad - op : conv3d @@ -451,7 +448,6 @@ func : Conv3DInferMeta kernel : func : conv3d - use_gpudnn : true backward : conv3d_grad - op : conv3d_transpose @@ -461,7 +457,6 @@ func : ConvTransposeInferMeta kernel : func : conv3d_transpose - use_gpudnn : true backward : conv3d_transpose_grad - op : copy_to @@ -540,7 +535,6 @@ kernel : func : depthwise_conv2d param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format] - use_gpudnn : true backward : depthwise_conv2d_grad - op : depthwise_conv2d_transpose @@ -1636,7 +1630,7 @@ backward : pixel_shuffle_grad - op : pool2d - args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(out) infer_meta : func : Pool2DInferMeta @@ -1644,11 +1638,10 @@ kernel : func : pool2d param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool2d_grad - op : pool3d - args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) output : Tensor(out) infer_meta : func : PoolInferMeta @@ -1656,7 +1649,6 @@ kernel : func : pool3d param : [x, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] - use_gpudnn : use_gpudnn backward : pool3d_grad - op : pow @@ -2048,7 +2040,6 @@ func : SoftmaxInferMeta kernel : func : softmax - use_gpudnn : true inplace : (x -> out) backward : softmax_grad diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 3fbf3560aff95..09ce2414150e1 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -200,9 +200,10 @@ void DenseTensor::set_meta(const DenseTensorMeta& meta) { meta_.layout = meta.layout; meta_.lod = meta.lod; meta_.offset = meta.offset; + meta_.use_cudnn = meta.use_cudnn; } -/* @jim19930609: This interface will be further modified util we finalized the +/* 
@jim19930609: This interface will be further modified until we finalized the design for Allocator - Allocation For now, we have to temporarily accommodate two independent use cases: 1. Designed behaviour: DenseTensor constructed with its underlying storage_ diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index c8998f65efb6a..3906282187d4c 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -357,6 +357,7 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { meta_.dtype = src.meta_.dtype; meta_.layout = src.meta_.layout; meta_.offset = src.meta_.offset; + meta_.use_cudnn = src.meta_.use_cudnn; storage_properties_ = std::move(CopyStorageProperties(src.storage_properties_)); #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 3370e9b805889..0d43d3189028b 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -106,17 +106,16 @@ bool KernelFactory::HasKernel(const std::string& kernel_name, } KernelResult KernelFactory::SelectKernelOrThrowError( - const std::string& kernel_name, - const KernelKey& kernel_key, - bool use_gpudnn) const { + const std::string& kernel_name, const KernelKey& const_kernel_key) const { auto iter = kernels_.find(kernel_name); PADDLE_ENFORCE_NE( iter, kernels_.end(), phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); + KernelKey kernel_key = const_kernel_key; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (use_gpudnn && kernel_key.backend() == Backend::GPU) { + if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, kernel_key.layout(), kernel_key.dtype()}); if (kernel_iter == iter->second.end() && @@ -127,8 +126,8 @@ KernelResult KernelFactory::SelectKernelOrThrowError( if (kernel_iter != iter->second.end()) { return {kernel_iter->second, false}; } - VLOG(3) << "The cudnn kernel for [" << kernel_name - << "] is not registered."; + kernel_key = + KernelKey(Backend::GPU, kernel_key.layout(), kernel_key.dtype()); } #endif auto kernel_iter = iter->second.find(kernel_key); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 423c2b8f0a5f6..69baf243e68ea 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -274,8 +274,7 @@ class KernelFactory { bool HasCompatiblePhiKernel(const std::string& op_type) const; KernelResult SelectKernelOrThrowError(const std::string& kernel_name, - const KernelKey& kernel_key, - bool use_gpudnn = false) const; + const KernelKey& kernel_key) const; bool HasKernel(const std::string& kernel_name, const KernelKey& kernel_key) const; diff --git a/paddle/phi/core/tensor_meta.cc b/paddle/phi/core/tensor_meta.cc index da08802576838..44b2dee358ad5 100644 --- a/paddle/phi/core/tensor_meta.cc +++ b/paddle/phi/core/tensor_meta.cc @@ -16,21 +16,29 @@ limitations under the License. 
*/ namespace phi { +DenseTensorMeta::DenseTensorMeta() { use_cudnn = true; } + DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) - : dims(dims), dtype(dtype) {} + : dims(dims), dtype(dtype) { + use_cudnn = true; +} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, size_t offset) - : dims(dims), dtype(dtype), layout(layout), offset(offset) {} + : dims(dims), dtype(dtype), layout(layout), offset(offset) { + use_cudnn = true; +} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, const LoD& lod, size_t offset) - : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) {} + : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) { + use_cudnn = true; +} bool DenseTensorMeta::valid() const noexcept { bool valid{true}; diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 71272235db11f..789a4422e25d1 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -48,7 +48,7 @@ using LoD = std::vector>; struct DenseTensorMeta { using DataType = paddle::experimental::DataType; - DenseTensorMeta() = default; + DenseTensorMeta(); DenseTensorMeta(DataType dtype, const DDim& dims); DenseTensorMeta(DataType dtype, const DDim& dims, @@ -65,6 +65,9 @@ struct DenseTensorMeta { bool valid() const noexcept; bool is_scalar{false}; + /// \brief Determine whether using CuDNN speed-up library in the new dygraph. + /// It maybe also support MKLDNN library in the near future. + bool use_cudnn{true}; DDim dims; DataType dtype{DataType::UNDEFINED}; DataLayout layout{DataLayout::NCHW}; @@ -73,9 +76,10 @@ struct DenseTensorMeta { }; inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { - return (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && - (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); + return (lhs.is_scalar == rhs.is_scalar) && lhs.use_cudnn == rhs.use_cudnn && + (lhs.dims == rhs.dims) && (lhs.dtype == rhs.dtype) && + (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) && + (lhs.offset == rhs.offset); } struct StringTensorMeta { diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 4c8b9d7f555f1..4cbe12698c58a 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -672,6 +672,7 @@ def __init__( def forward(self, input): if _non_static_mode(): if not self._use_mkldnn and in_dygraph_mode(): + input = input._use_cudnn(self._use_cudnn) return _C_ops.pool2d( input, self._pool_size, @@ -684,7 +685,6 @@ def forward(self, input): self._global_pooling, False, "EXPLICIT", - self._use_cudnn, ) attrs = ( diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 7c7aa964cf84d..6fa46692c79e6 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -880,6 +880,10 @@ def _numel(self): def _clear_data(self): self.get_tensor()._clear() + @framework.dygraph_only + def _use_cudnn(self, use_cudnn=True): + return self._tensor_use_cudnn(use_cudnn) + @framework.dygraph_only def _uva(self, device_id=0): ''' @@ -1064,6 +1068,7 @@ def __hash__(self): setattr(core.eager.Tensor, "_uva", _uva) setattr(core.eager.Tensor, "_clear_data", _clear_data) setattr(core.eager.Tensor, "__hash__", __hash__) + setattr(core.eager.Tensor, "_use_cudnn", _use_cudnn) else: setattr(core.VarBase, 
"__name__", "Tensor") setattr(core.VarBase, "grad", grad) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bd5b11e1364b5..71f5702a7cc9a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2196,6 +2196,7 @@ def is_list_or_tuple(ele): pool_padding = update_padding(pool_padding, data_format) if in_dygraph_mode(): + input = input._use_cudnn(use_cudnn) return _C_ops.pool2d( input, pool_size, @@ -2208,7 +2209,6 @@ def is_list_or_tuple(ele): global_pooling, False, padding_algorithm, - use_cudnn, ) op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index eee9e8eac4b0e..4471d78936ab8 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -897,6 +897,21 @@ def test_clear(self): x._clear() self.assertFalse(x._is_initialized()) + def test_use_cudnn(self): + np_x = np.random.random((3, 8, 8)) + with _test_eager_guard(): + self.assertTrue(in_dygraph_mode()) + x = paddle.to_tensor(np_x, dtype="float64") + y = x._use_cudnn(False) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + y = x._use_cudnn(True) + np.testing.assert_array_equal(x.numpy(), y.numpy()) + + self.assertFalse(in_dygraph_mode()) + x = paddle.to_tensor(np_x, dtype="float64") + with self.assertRaises(AttributeError): + x = x._use_cudnn(False) + class EagerParamBaseUsageTestCase(unittest.TestCase): def test_print(self): diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 5e8f77a9810bf..f30be705207cd 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -258,7 +258,6 @@ def avg_pool1d( False, False, padding_algorithm, - True, ) return squeeze(output, [2]) @@ -407,7 +406,6 @@ def avg_pool2d( False, False, padding_algorithm, - True, ) else: output = _legacy_C_ops.pool2d( @@ -561,7 +559,6 @@ def avg_pool3d( False, False, padding_algorithm, - True, ) elif _in_legacy_dygraph(): pool_out = _legacy_C_ops.pool3d( @@ -718,7 +715,6 @@ def max_pool1d( False, False, padding_algorithm, - True, ) return squeeze(pool_out, [2]) @@ -1363,7 +1359,6 @@ def max_pool2d( False, False, padding_algorithm, - True, ) if _in_legacy_dygraph(): @@ -1554,7 +1549,6 @@ def max_pool3d( False, False, padding_algorithm, - True, ) if _in_legacy_dygraph(): @@ -1691,6 +1685,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): x = unsqueeze(x, [2]) if in_dygraph_mode(): + x = x._use_cudnn(False) pool_out = _C_ops.pool2d( x, pool_size, @@ -1703,7 +1698,6 @@ def adaptive_avg_pool1d(x, output_size, name=None): False, True, "EXPLICIT", - False, ) return squeeze(pool_out, [2]) if _in_legacy_dygraph(): @@ -1828,6 +1822,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size = utils._convert_to_tensor_list(output_size) if in_dygraph_mode(): + x = x._use_cudnn(False) return _C_ops.pool2d( x, output_size, @@ -1840,7 +1835,6 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): False, True, "EXPLICIT", - False, ) if _in_legacy_dygraph(): @@ -1973,6 +1967,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): output_size[2] = in_w if in_dygraph_mode(): + x = x._use_cudnn(False) return _C_ops.pool3d( x, output_size, @@ -1985,7 +1980,6 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): False, 
True, "EXPLICIT", - False, ) elif _in_legacy_dygraph(): return _legacy_C_ops.pool3d( diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index ebe1ec7e9bcdd..6d061ff6294b8 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -92,7 +92,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): if isinstance(out_shape, Variable) else out_shape ) - return _C_ops.affine_grid(theta, _out_shape, align_corners, use_cudnn) + theta = theta._use_cudnn(use_cudnn) + return _C_ops.affine_grid(theta, _out_shape, align_corners) elif in_dynamic_mode(): _out_shape = ( out_shape.numpy().tolist() From 5664306b1875ec4968422e7336c6f5f7d5c2f5da Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 24 Nov 2022 14:20:56 +0800 Subject: [PATCH 193/210] [Dy2St] remove deprecated JIT engines (#48298) --- paddle/fluid/jit/CMakeLists.txt | 5 +- paddle/fluid/jit/engine/CMakeLists.txt | 10 -- paddle/fluid/jit/engine/executor_engine.cc | 66 ------------ paddle/fluid/jit/engine/executor_engine.h | 51 --------- paddle/fluid/jit/engine/pe_engine.cc | 115 --------------------- paddle/fluid/jit/engine/pe_engine.h | 67 ------------ paddle/fluid/jit/serializer.cc | 11 +- 7 files changed, 2 insertions(+), 323 deletions(-) delete mode 100644 paddle/fluid/jit/engine/executor_engine.cc delete mode 100644 paddle/fluid/jit/engine/executor_engine.h delete mode 100644 paddle/fluid/jit/engine/pe_engine.cc delete mode 100644 paddle/fluid/jit/engine/pe_engine.h diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index b6db37d82c3af..150af80d5a89b 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -34,8 +34,7 @@ cc_library( cc_library( jit_function SRCS function.cc - DEPS jit_function_utils jit_executor_engine jit_pe_engine - jit_interpreter_engine jit_predictor_engine) + DEPS jit_function_utils jit_interpreter_engine jit_predictor_engine) cc_library( jit_layer @@ -45,8 +44,6 @@ cc_library( jit_serializer_utils jit_compilation_unit jit_function_schema - jit_executor_engine - jit_pe_engine jit_interpreter_engine jit_predictor_engine jit_function) diff --git a/paddle/fluid/jit/engine/CMakeLists.txt b/paddle/fluid/jit/engine/CMakeLists.txt index b09e818227d76..94407497add2b 100644 --- a/paddle/fluid/jit/engine/CMakeLists.txt +++ b/paddle/fluid/jit/engine/CMakeLists.txt @@ -1,13 +1,3 @@ -cc_library( - jit_executor_engine - SRCS executor_engine.cc - DEPS executor) - -cc_library( - jit_pe_engine - SRCS pe_engine.cc - DEPS parallel_executor) - cc_library( jit_interpreter_engine SRCS interpreter_engine.cc diff --git a/paddle/fluid/jit/engine/executor_engine.cc b/paddle/fluid/jit/engine/executor_engine.cc deleted file mode 100644 index 1cde715b8f030..0000000000000 --- a/paddle/fluid/jit/engine/executor_engine.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
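Taken together, the pool2d/pool3d, conv, softmax and affine_grid changes above remove the trailing use_gpudnn/use_cudnn flags from the op definitions and instead record the preference on the tensor itself: Python callers toggle it with the new Tensor._use_cudnn() helper, which sets DenseTensorMeta.use_cudnn (true by default) before kernel selection. A minimal sketch of the resulting dygraph call pattern, using the internal _C_ops entry point purely for illustration and following the updated pool2d argument order; the shapes and pooling settings here are made up:

    import paddle
    from paddle import _C_ops

    x = paddle.rand([2, 3, 32, 32])
    # The cudnn preference now travels with the tensor, not the op call.
    x = x._use_cudnn(False)
    out = _C_ops.pool2d(
        x,
        [2, 2], [2, 2], [0, 0],   # kernel_size, strides, paddings
        False, True,              # ceil_mode, exclusive
        "NCHW", "avg",            # data_format, pooling_type
        False, False,             # global_pooling, adaptive
        "EXPLICIT",               # padding_algorithm
    )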
- -#include "paddle/fluid/jit/engine/executor_engine.h" - -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace jit { - -ExecutorEngine::ExecutorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), place_(place), inner_exe_(place_) { - info_->RemoveDescFeedFetch(); - PADDLE_ENFORCE_GT( - static_cast(info_->ProgramDesc().Block(0).OpSize()), - 0, - platform::errors::PreconditionNotMet( - "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); -} - -std::vector ExecutorEngine::operator()( - const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); -} - -std::vector ExecutorEngine::operator()( - const std::vector &inputs) { - utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); - const auto out_names = info_->OutputArgNames(); - inner_exe_.Run(info_->ProgramDesc(), - &scope_, - /*blockID=*/0, - false, - true, - out_names); - std::vector outputs; - utils::FetchOuts(out_names, scope_, &outputs); - // Erase output vars to avoid data rewriting. - scope_.EraseVars(out_names); - return outputs; -} - -const std::shared_ptr &ExecutorEngine::Info() const { - return info_; -} - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/engine/executor_engine.h b/paddle/fluid/jit/engine/executor_engine.h deleted file mode 100644 index a39cf85020c1b..0000000000000 --- a/paddle/fluid/jit/engine/executor_engine.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/scope.h" - -#include "paddle/fluid/jit/engine/base_engine.h" -#include "paddle/fluid/jit/function_schema.h" -#include "paddle/fluid/jit/function_utils.h" - -namespace paddle { -namespace jit { - -class ExecutorEngine : public BaseEngine { - public: - ExecutorEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place); - - ~ExecutorEngine() noexcept {} - - std::vector operator()(const std::vector &inputs); - - std::vector operator()(const std::vector &inputs); - - const std::shared_ptr &Info() const; - - private: - std::shared_ptr info_; - framework::Scope scope_; - phi::Place place_; - framework::Executor inner_exe_; -}; - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/engine/pe_engine.cc b/paddle/fluid/jit/engine/pe_engine.cc deleted file mode 100644 index 576687c0efaf1..0000000000000 --- a/paddle/fluid/jit/engine/pe_engine.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/jit/engine/pe_engine.h" - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/parallel_executor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace jit { - -static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { - ExecutionStrategy execution_strategy; - - auto device_type = platform::Place2DeviceType(place); - switch (device_type) { - case platform::DeviceType::CPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::CUDA: { - // NOTE: According experiments, one thread is faster in - // most model training. - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::XPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::IPU: { - execution_strategy.num_threads_ = 1; - break; - } - default: - PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", - device_type)); - } - execution_strategy.use_device_ = device_type; - - return execution_strategy; -} - -PEEngine::PEEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place) - : info_(info), place_(place) { - info_->RemoveDescFeedFetch(); - PADDLE_ENFORCE_GT( - static_cast(info_->ProgramDesc().Block(0).OpSize()), - 0, - platform::errors::PreconditionNotMet( - "There is no operator in ProgramDesc.")); - utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - CreateGraphAndPE(); -} - -void PEEngine::CreateGraphAndPE() { - framework::details::BuildStrategy build_strategy; - build_strategy.enable_inference_pass_ = true; // use pe to inference - auto execution_strategy = GetExecutionStrategy(place_); - - auto &program_desc = info_->ProgramDesc(); - const framework::BlockDesc &global_block = program_desc.Block(0); - int64_t start_op_index = 0; - int64_t end_op_index = static_cast(global_block.OpSize()); - - graph_ = std::make_shared(program_desc, start_op_index, end_op_index); - inner_pe_ = std::make_shared( - place_, &scope_, execution_strategy, build_strategy, graph_.get()); - inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames()); -} - -std::vector PEEngine::operator()(const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); -} - -std::vector PEEngine::operator()( - const std::vector &inputs) { - utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); - const auto out_names = info_->OutputArgNames(); - // need to recreate tmp variables in new scope - inner_pe_->PrepareVariables(&scope_); - inner_pe_->RunWithoutFetch(out_names); - - std::vector outputs; - 
utils::FetchOuts(out_names, scope_, &outputs); - // Erase output vars to avoid data rewriting. - scope_.EraseVars(out_names); - scope_.DropKids(); - return outputs; -} - -const std::shared_ptr &PEEngine::Info() const { return info_; } - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/engine/pe_engine.h b/paddle/fluid/jit/engine/pe_engine.h deleted file mode 100644 index 16ade6d77d8ac..0000000000000 --- a/paddle/fluid/jit/engine/pe_engine.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/scope.h" - -#include "paddle/fluid/jit/engine/base_engine.h" -#include "paddle/fluid/jit/function_schema.h" -#include "paddle/fluid/jit/function_utils.h" - -namespace paddle { - -namespace framework { -class ParallelExecutor; -namespace details { -class ExecutionStrategy; -} -namespace ir { -class Graph; -} -} // namespace framework - -namespace jit { -using ExecutionStrategy = framework::details::ExecutionStrategy; -using ParallelExecutor = framework::ParallelExecutor; -using Graph = framework::ir::Graph; - -class PEEngine : public BaseEngine { - public: - PEEngine(const std::shared_ptr &info, - const VariableMap ¶ms_dict, - const phi::Place &place); - - ~PEEngine() noexcept {} - - void CreateGraphAndPE(); - - std::vector operator()(const std::vector &inputs); - - std::vector operator()(const std::vector &inputs); - - const std::shared_ptr &Info() const; - - private: - std::shared_ptr info_; - framework::Scope scope_; - phi::Place place_; - std::shared_ptr inner_pe_; - std::shared_ptr graph_; -}; - -} // namespace jit -} // namespace paddle diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 9c819c52718c0..0a7fdc0e3525a 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -20,9 +20,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/jit/engine/executor_engine.h" #include "paddle/fluid/jit/engine/interpreter_engine.h" -#include "paddle/fluid/jit/engine/pe_engine.h" #include "paddle/fluid/jit/engine/predictor_engine.h" #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/jit/property.h" @@ -74,14 +72,7 @@ Layer Deserializer::operator()(const std::string& path, auto& info = it->second; VLOG(3) << "Add function type: " << FLAGS_jit_engine_type << " Function name: " << func_name; - if (FLAGS_jit_engine_type == "Executor") { - layer.SetEngine( - func_name, - utils::MakeEngine(info, params_dict, place)); - } else if (FLAGS_jit_engine_type == "PE") { - layer.SetEngine(func_name, - utils::MakeEngine(info, params_dict, place)); - } else if (FLAGS_jit_engine_type == "New") { + if (FLAGS_jit_engine_type == "New") { layer.SetEngine( func_name, utils::MakeEngine(info, params_dict, place)); From d39f3fb6250089b0c56c18396c0c318449e8944a Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 24 Nov 2022 14:54:12 +0800 Subject: [PATCH 194/210] =?UTF-8?q?=EF=BC=88fluid=20API=20clear=EF=BC=89re?= =?UTF-8?q?move=20fluid.layers.brelu=20in=20nn.py=20under=20fluid=20(#4789?= =?UTF-8?q?8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * remove brelu in nn.py under fluid * add brelu op test case --- python/paddle/fluid/layers/nn.py | 47 ------------------- .../unittests/ipu/test_activation_ops_ipu.py | 10 ++-- .../tests/unittests/test_activation_op.py | 45 ------------------ 3 files changed, 5 insertions(+), 97 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 71f5702a7cc9a..95126e9f239d1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -115,7 +115,6 @@ 'log', 'crop_tensor', 'prelu', - 'brelu', 'flatten', 'pad2d', 'unique', @@ -7831,52 +7830,6 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): return out -@templatedoc() -def brelu(x, t_min=0.0, t_max=24.0, name=None): - """ - ${comment} - Args: - x(${x_type}): ${x_comment} - t_min(${t_min_type}|0.0): ${t_min_comment} - t_max(${t_max_type}|24.0): ${t_max_comment} - name(str|None): The default value is None. Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - Returns: - ${out_type}: ${out_comment} - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle - import numpy as np - paddle.enable_static() - - input_brelu = np.array([[-1,6],[1,15.6]]) - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(input_brelu) - y = fluid.layers.brelu(x, t_min=1.0, t_max=10.0) - print(y.numpy()) - #[[ 1. 6.] - #[ 1. 
10.]] - """ - if _non_static_mode(): - return _legacy_C_ops.brelu(x, 't_min', t_min, 't_max', t_max) - - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu') - - helper = LayerHelper('brelu', **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='brelu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'t_min': t_min, 't_max': t_max}, - ) - return out - - def flatten(x, axis=1, name=None): r""" **Flatten op** diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py index 672195469d2b2..3a16a06ea1b81 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py @@ -63,7 +63,7 @@ def test(self): self.check() -class TestBReluCase0(TestBase): +class TestHardTanhCase0(TestBase): def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) * 30 self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -71,14 +71,14 @@ def set_data_feed(self): self.feed_list = list(self.feed_fp32.keys()) def set_test_op(self): - self.op = paddle.fluid.layers.brelu + self.op = paddle.nn.functional.hardtanh self.op_attrs = {} -class TestBReluCase1(TestBReluCase0): +class TestHardTanhCase1(TestHardTanhCase0): def set_test_op(self): - self.op = paddle.fluid.layers.brelu - self.op_attrs = {"t_min": 0.1, 't_max': 10.0} + self.op = paddle.nn.functional.hardtanh + self.op_attrs = {"min": 0.1, 'max': 10.0} class TestEluCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 39a866434afd6..6cfe72bfdfd39 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1891,51 +1891,6 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestBreluAPI(unittest.TestCase): - # test paddle.fluid.layers.brelu - def setUp(self): - np.random.seed(1024) - self.t_min = 0.0 - self.t_max = 24.0 - self.x_np = np.random.uniform(-1, 30, [10, 12]).astype('float32') - self.out_ref = np.copy(self.x_np) - self.out_ref[self.out_ref < self.t_min] = self.t_min - self.out_ref[self.out_ref > self.t_max] = self.t_max - self.out_ref = self.out_ref.astype('float32') - self.place = ( - paddle.CUDAPlace(0) - if paddle.is_compiled_with_cuda() - else paddle.CPUPlace() - ) - - def test_fluid_api(self): - with paddle.static.program_guard(paddle.static.Program()): - x = paddle.static.data('X', [10, 12]) - out = paddle.fluid.layers.brelu(x) - exe = paddle.static.Executor(self.place) - res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) - np.testing.assert_allclose(self.out_ref, res[0], rtol=1e-05) - - paddle.disable_static(self.place) - x = paddle.to_tensor(self.x_np) - out = paddle.fluid.layers.brelu(x) - np.testing.assert_allclose(self.out_ref, out.numpy(), rtol=1e-05) - paddle.enable_static() - - def test_errors(self): - with program_guard(Program()): - # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.brelu, 1) - # The input dtype must be float16, float32, float64. 
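The brelu removal above points users at the public hardtanh API instead; the updated IPU test swaps fluid.layers.brelu for paddle.nn.functional.hardtanh, with t_min/t_max becoming min/max. A small migration sketch, reusing the values from the deleted docstring example:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor(np.array([[-1.0, 6.0], [1.0, 15.6]], dtype="float32"))
    # Previously: fluid.layers.brelu(x, t_min=1.0, t_max=10.0)
    y = F.hardtanh(x, min=1.0, max=10.0)
    print(y.numpy())
    # [[ 1.  6.]
    #  [ 1. 10.]]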
- x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.brelu, x_int32) - # support the input dtype is float16 - x_fp16 = fluid.layers.data( - name='x_fp16', shape=[12, 10], dtype='float16' - ) - fluid.layers.brelu(x_fp16) - - def ref_relu6(x, threshold=6.0): out = np.copy(x) out[np.abs(x - threshold) < 0.005] = threshold + 0.02 From 297827280125ff3e18ea4e1e12194f96dfd83a6d Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 24 Nov 2022 15:06:11 +0800 Subject: [PATCH 195/210] [Paddle Inference]optimize token prune for Paddle-TensorRT (#48241) * optimize token prune --- .../ir/remove_padding_recover_padding_pass.cc | 57 +++ .../ir/remove_padding_recover_padding_pass.h | 12 +- .../tensorrt/convert/fused_token_prune_op.cc | 15 +- .../plugin/fused_token_prune_op_plugin.cu | 371 ++++++++++++------ .../plugin/fused_token_prune_op_plugin.h | 33 +- .../tensorrt/plugin/recover_padding_plugin.cu | 59 +-- .../tensorrt/plugin/remove_padding_plugin.cu | 57 +-- .../plugin/test_fused_token_prune_plugin.cc | 3 - 8 files changed, 422 insertions(+), 185 deletions(-) diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc index 237cb3bd3d7d1..5127c5934cb48 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -131,6 +131,21 @@ void Activation::operator()() { // Add links for activation op. activation_op->LinksFrom({activation_input}).LinksTo({activation_out}); } + +void FusedTokenPrune::operator()() { + // Create nodes for fused_token_prune. + auto* fused_token_prune_input = + pattern->NewNode(fused_token_prune_input_repr()) + ->assert_is_op_input("fused_token_prune", "X"); + auto* fused_token_prune_op = pattern->NewNode(fused_token_prune_op_repr()) + ->assert_is_op("fused_token_prune"); + auto* fused_token_prune_output = + pattern->NewNode(fused_token_prune_output_repr()) + ->assert_is_op_output("fused_token_prune", "SlimmedX"); + + fused_token_prune_op->LinksFrom({fused_token_prune_input}) + .LinksTo({fused_token_prune_output}); +} } // namespace patterns void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { @@ -563,6 +578,48 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { }; gpd6(graph, handler6); + GraphPatternDetector gpd7; + patterns::FusedTokenPrune fused_token_prune( + gpd7.mutable_pattern(), "remove_padding_recover_padding_pass"); + fused_token_prune(); + + auto handler7 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "fused_token_prune"; + + GET_IR_NODE_FROM_SUBGRAPH( + fused_token_prune_input, fused_token_prune_input, fused_token_prune); + GET_IR_NODE_FROM_SUBGRAPH( + fused_token_prune_op, fused_token_prune_op, fused_token_prune); + GET_IR_NODE_FROM_SUBGRAPH( + fused_token_prune_output, fused_token_prune_output, fused_token_prune); + + std::vector fused_token_prune_input_shape = + fused_token_prune_input->Var()->GetShape(); + check_flag = true; + if (fused_token_prune_input_shape.size() != + multihead_matmul_input_shape.size()) { + check_flag = false; + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + for (size_t i = 0; i < fused_token_prune_input_shape.size(); ++i) { + if (fused_token_prune_input_shape[i] != 
multihead_matmul_input_shape[i]) { + check_flag = false; + } + } + if (!check_flag) { + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + insert_recover_padding_op(fused_token_prune_op, fused_token_prune_output); + found_subgraph_count++; + }; + gpd7(graph, handler7); + AddStatis(found_subgraph_count); } diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h index f93ee4bc7c4ff..ff04dc55323ab 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -95,7 +95,6 @@ struct Fc : public PatternBase { PATTERN_DECL_NODE(fc_input); PATTERN_DECL_NODE(fc_op); - PATTERN_DECL_NODE(fc_out); }; struct Activation : public PatternBase { @@ -108,6 +107,17 @@ struct Activation : public PatternBase { PATTERN_DECL_NODE(activation_op); PATTERN_DECL_NODE(activation_out); }; + +struct FusedTokenPrune : public PatternBase { + FusedTokenPrune(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "fused_token_prune") {} + + void operator()(); + + PATTERN_DECL_NODE(fused_token_prune_input); + PATTERN_DECL_NODE(fused_token_prune_op); + PATTERN_DECL_NODE(fused_token_prune_output); +}; } // namespace patterns class RemovePaddingRecoverPaddingPass : public FusePassBase { diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc index dba0d003c0fe5..4832b1fad1826 100644 --- a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc @@ -52,8 +52,21 @@ class FusedTokenPruneOpConverter : public OpConverter { auto* word_id = engine_->GetITensor("word_id"); auto* pos_id = engine_->GetITensor("pos_id"); auto* mask_id = engine_->GetITensor("mask_id"); + + // reduce_sum: (-1,headsize,token_length,token_length) -> + // (-1,token_length) + uint32_t reduce_dim = 0; + reduce_dim |= 1 << 1; // 00000000000000000000000000000010 + reduce_dim |= 1 << 2; // 00000000000000000000000000000110 + bool keep_dim = false; + nvinfer1::ReduceOperation reduce_type = nvinfer1::ReduceOperation::kSUM; + auto* reduce_sum_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *Attn, reduce_type, reduce_dim, keep_dim); + // reduce_sum_layer->getOutput(0)->setType(reduce_sum_layer->getInput(0)->getType()); + + auto* Reduced = reduce_sum_layer->getOutput(0); std::vector itensors = { - Attn, X, Mask, NewMask, word_id, pos_id, mask_id}; + Reduced, X, Mask, NewMask, word_id, pos_id, mask_id}; layer = engine_->AddDynamicPlugin(itensors.data(), 7, plugin); layer->getOutput(0)->setName(output_name.c_str()); diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index fe011422c19e9..b0c800d31bf3a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -31,19 +31,15 @@ namespace inference { namespace tensorrt { namespace plugin { -#if IS_TRT_VERSION_GE(6000) - template __global__ void ElementwiseMask(const T* a, const T* b, T* res, int num_elements) { -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= num_elements) return; const T zero = 0; res[tid] = b[tid] >= zero ? 
a[tid] : zero; -#endif } template @@ -123,7 +119,6 @@ __global__ void ReduceSum2( template <> __global__ void ReduceSum2( const half* src, half* dst, int bsz, int nb_head, int max_seq_len) { -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) int tid = threadIdx.x; int bid = blockIdx.x; int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); @@ -155,7 +150,6 @@ __global__ void ReduceSum2( static_cast(bsz * max_seq_len), static_cast(res_half[0])); } -#endif } template @@ -177,14 +171,81 @@ __global__ void TakeAlongAxis(const T* src, } } -__global__ void pos_id_prune_kernel(const int32_t* src, - int32_t* dst, - int pos_nums, - float scale) { - dst[0] = 0; - for (int i = 1; i < pos_nums; i++) { - dst[i] = - dst[i - 1] + max(static_cast((src[i] - src[i - 1]) * scale), 2); +__global__ void compute_token_length(const int32_t* src, + int32_t* dst, + float scale) { + int32_t it = threadIdx.x; + dst[it] = max(static_cast((src[it + 1] - src[it]) * scale), 1); +} + +__global__ void fill_index_padding_score(int32_t* token_index, + const half* scores, + int32_t scores_size, + half* padding_scores) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + token_index[tid] = threadIdx.x; + if (tid < scores_size) { + padding_scores[tid] = scores[tid]; + } else { + padding_scores[tid] = 0; + } +} + +template +__global__ void general_topk_pair_sort(T* in_keys, int32_t* in_out_values) { + typedef cub::BlockRadixSort + BlockRadixSort; + typedef cub:: + BlockLoad + BlockLoadKey; + typedef cub:: + BlockLoad + BlockLoadValue; + typedef cub:: + BlockStore + BlockStoreKey; + typedef cub::BlockStore + BlockStoreValue; + + __shared__ union { + typename BlockRadixSort::TempStorage sort; + typename BlockLoadKey::TempStorage loadkey; + typename BlockLoadValue::TempStorage loadvalue; + typename BlockStoreKey::TempStorage storekey; + typename BlockStoreValue::TempStorage storevalue; + } temp_storage; + + int block_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD; + + T thread_keys[ITEMS_PER_THREAD]; + int thread_values[ITEMS_PER_THREAD]; + BlockLoadKey(temp_storage.loadkey).Load(in_keys + block_offset, thread_keys); + BlockLoadValue(temp_storage.loadvalue) + .Load(in_out_values + block_offset, thread_values); + __syncthreads(); + + BlockRadixSort(temp_storage.sort).SortDescending(thread_keys, thread_values); + __syncthreads(); + + BlockStoreValue(temp_storage.storevalue) + .Store(in_out_values + block_offset, thread_values); +} + +__global__ void varlen_prune_token(const half* tokens, + const int32_t* token_pos, + const int32_t* token_index, + half* output) { + int batch = blockIdx.x; + int token_it = batch * gridDim.y + blockIdx.y; + int pre_value_it = + token_it * gridDim.z * blockDim.x + blockIdx.z * blockDim.x + threadIdx.x; + + if (token_index[token_it] < token_pos[batch + 1] - token_pos[batch]) { + output[(token_index[token_it] + token_pos[batch]) * gridDim.z * blockDim.x + + blockIdx.z * blockDim.x + threadIdx.x] = tokens[pre_value_it]; } } @@ -195,9 +256,29 @@ nvinfer1::DimsExprs FusedTokenPrunePluginDynamic::getOutputDimensions( nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { auto x_dims = inputs[1], new_mask_dims = inputs[3]; if (flag_varseqlen_) { + // max sum of seqlen: ceil(sum / scale) + n -1 >= for(i=0;i>>(input, output, pos_nums, scale); -} - int FusedTokenPrunePluginDynamic::enqueue( const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, @@ -572,73 +621,153 @@ int FusedTokenPrunePluginDynamic::enqueue( void* const* outputs, void* workspace, 
cudaStream_t stream) TRT_NOEXCEPT { - auto input_type = input_desc[0].type; - auto attn_dims = input_desc[0].dims; - auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], - max_seq_len = attn_dims.d[2]; - int device_id; - cudaGetDevice(&device_id); - - if (input_type == nvinfer1::DataType::kFLOAT) { - VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp32"; - - float max = std::numeric_limits::max(); - - enqueueImpl(input_desc, - output_desc, - inputs, - outputs, - workspace, - stream, - device_id, - max, - keep_first_token_, - keep_order_); - - } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef TRT_PLUGIN_FP16_AVALIABLE - VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp16"; - - half max = 65504.0; - - enqueueImpl(input_desc, - output_desc, - inputs, - outputs, - workspace, - stream, - device_id, - max, - keep_first_token_, - keep_order_); - -#else - PADDLE_THROW(platform::errors::Fatal( - "The Ernie(Bert) TensorRT Plugin should be " - "complied with CUDA version >= 10.0 when running with fp16. " - "Please recomplie it or try to use fp32 by set " - "config.SetTRTDynamicShapeInfo(min_input_shape, " - "max_input_shape, opt_input_shape, true")); -#endif - } else { - PADDLE_THROW( - platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type " - "should be float or half.")); - } if (flag_varseqlen_) { + if (!(input_desc[0].type == nvinfer1::DataType::kHALF && + input_desc[1].type == nvinfer1::DataType::kHALF)) { + PADDLE_THROW( + platform::errors::InvalidArgument("Token_prune'type must half")); + } float scale = static_cast(input_desc[3].dims.d[2]) / input_desc[6].dims.d[1]; - // outputs[2]=inputs[4]; // word_id - const int32_t* inputs5 = static_cast(inputs[5]); - int32_t* outputs3 = static_cast(outputs[3]); - pos_id_prune( - inputs5, outputs3, input_desc[5].dims.d[0], scale, stream); // pos_id - // outputs[4]=inputs[6]; // new_mask + const int32_t* inputs5 = + static_cast(inputs[5]); // pre pos id + int32_t* outputs3 = static_cast(outputs[3]); // new pos id + half* outputs0 = static_cast(outputs[0]); + + const int32_t B = input_desc[1].dims.d[0]; // batchs + const int32_t max_sequnce_length = + input_desc[1].dims.d[1]; // max sequnce length + const int32_t length = input_desc[1].dims.d[2]; // vector length + const half* scores = static_cast(inputs[0]); // reduce sum + const half* tokens = static_cast(inputs[1]); + const int32_t scores_size = B * max_sequnce_length; + int32_t padding_token_length; + if (max_sequnce_length <= 128) { + padding_token_length = 128; + } else if (max_sequnce_length <= 256) { + padding_token_length = 256; + } else if (max_sequnce_length <= 384) { + padding_token_length = 384; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Token_prune'token_length must <= 384")); + } + + // 1. Compute the token length after pruning. 
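// Outline of the rest of this varlen branch (all kernels are defined above):
// 2. fill_index_padding_score initializes per-sequence token indices and
//    copies the reduce-sum scores into the padded buffer padding_scores_.
// 3. cub::DeviceScan::ExclusiveSum converts pruned_token_lengths_ into the
//    new pos_id offsets written to outputs3.
// 4. general_topk_pair_sort reorders each sequence's token indices by score,
//    descending, using cub::BlockRadixSort.
// 5. varlen_prune_token gathers the kept tokens into outputs0 using the
//    sorted indices and the new offsets.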
+ compute_token_length<<<1, B, 0, stream>>>( + inputs5, pruned_token_lengths_, scale); + + fill_index_padding_score<<>>( + token_index_, scores, scores_size, padding_scores_); + + // Determine temporary device storage requirements + void* d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + pruned_token_lengths_, + outputs3, + B + 1); + // Allocate temporary storage + cudaMalloc(&d_temp_storage, temp_storage_bytes); + + // Run exclusive prefix sum + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + pruned_token_lengths_, + outputs3, + B + 1); + + if (padding_token_length == 128) { + general_topk_pair_sort + <<>>(padding_scores_, token_index_); // 128 + } else if (padding_token_length == 256) { + general_topk_pair_sort + <<>>(padding_scores_, token_index_); // 256 + } else { + general_topk_pair_sort + <<>>(padding_scores_, token_index_); // 384 + } + + int32_t num_threads; + if (length < 1024) { + num_threads = length; + } else { + if (length % 512 == 0) { + num_threads = 512; + } else if (length % 256 == 0) { + num_threads = 256; + } else if (length % 128 == 0) { + num_threads = 128; + } else if (length % 64 == 0) { + num_threads = 64; + } else if (length % 32 == 0) { + num_threads = 32; + } else if (length % 16 == 0) { + num_threads = 16; + } else if (length % 8 == 0) { + num_threads = 8; + } else if (length % 4 == 0) { + num_threads = 4; + } else if (length % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } + } + const dim3 num_blocks( + B, + max_sequnce_length, + length / num_threads); // batchs, max_sequnce_length, vector_ength/*** + varlen_prune_token<<>>( + tokens, outputs3, token_index_, outputs0); + } else { + auto input_type = input_desc[0].type; + auto attn_dims = input_desc[0].dims; + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + int device_id; + cudaGetDevice(&device_id); + + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp32"; + + float max = std::numeric_limits::max(); + + enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max, + keep_first_token_, + keep_order_); + + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. 
FusedTokenPrune-->fp16"; + + half max = 65504.0; + enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max, + keep_first_token_, + keep_order_); + } else { + PADDLE_THROW( + platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type " + "should be float or half.")); + } } return cudaGetLastError() != cudaSuccess; } -#endif } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h index 0b32e8a552bb7..4c9c24c59afa2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace inference { @@ -30,11 +31,10 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { bool keep_first_token, bool keep_order, bool flag_varseqlen) - : keep_first_token_(keep_first_token), + : with_fp16_(with_fp16), + keep_first_token_(keep_first_token), keep_order_(keep_order), - flag_varseqlen_(flag_varseqlen) { - with_fp16_ = with_fp16; - } + flag_varseqlen_(flag_varseqlen) {} FusedTokenPrunePluginDynamic(void const* serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &with_fp16_); DeserializeValue(&serial_data, &serial_length, &keep_first_token_); @@ -42,8 +42,14 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &flag_varseqlen_); } nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { - return new FusedTokenPrunePluginDynamic( + FusedTokenPrunePluginDynamic* ptr = new FusedTokenPrunePluginDynamic( with_fp16_, keep_first_token_, keep_order_, flag_varseqlen_); + ptr->max_batchs_ = max_batchs_; + ptr->max_token_length_ = max_token_length_; + ptr->pruned_token_lengths_ = pruned_token_lengths_; + ptr->token_index_ = token_index_; + ptr->padding_scores_ = padding_scores_; + return ptr; } const char* getPluginType() const TRT_NOEXCEPT override { @@ -84,7 +90,16 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) TRT_NOEXCEPT override {} + int nb_outputs) TRT_NOEXCEPT override { + max_batchs_ = in[1].max.d[0]; + max_token_length_ = in[1].max.d[1]; + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc(&pruned_token_lengths_, + (max_batchs_ + 1) * sizeof(int32_t))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( + &token_index_, max_batchs_ * max_token_length_ * sizeof(int32_t))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMalloc( + &padding_scores_, max_batchs_ * max_token_length_ * sizeof(half))); + } size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, @@ -106,9 +121,15 @@ class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { void destroy() TRT_NOEXCEPT override { delete this; } private: + bool with_fp16_; bool keep_first_token_; bool keep_order_; bool flag_varseqlen_; + int32_t* pruned_token_lengths_; + int32_t* token_index_; + int32_t max_batchs_; + int32_t max_token_length_; + half* padding_scores_; }; class FusedTokenPrunePluginDynamicCreator : public nvinfer1::IPluginCreator { diff --git 
a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu index c6be871709452..50884b79d835e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu @@ -19,9 +19,9 @@ namespace inference { namespace tensorrt { namespace plugin { -__global__ void RecoverPaddingKernel(const float* input0, +__global__ void RecoverPaddingKernel(const half* input0, const int32_t* input1, - float* output) { + half* output) { int word_id = blockIdx.x * gridDim.y + blockIdx.y; int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x]; if (blockIdx.y < seqence_length) { @@ -79,7 +79,7 @@ bool RecoverPaddingPlugin::supportsFormatCombination( return inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } else { - return inOut[pos].type == nvinfer1::DataType::kFLOAT && + return inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format @@ -114,38 +114,43 @@ int RecoverPaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const auto input0_desc = inputDesc[0]; const auto input1_desc = inputDesc[1]; const auto input2_desc = inputDesc[2]; - const float* input0 = static_cast(inputs[0]); + const half* input0 = static_cast(inputs[0]); const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor - float* output = static_cast(outputs[0]); + half* output = static_cast(outputs[0]); + const int32_t vector_length = input0_desc.dims.d[1]; int32_t num_threads; - if (input0_desc.dims.d[1] % 512 == 0) { - num_threads = 512; - } else if (input0_desc.dims.d[1] % 256 == 0) { - num_threads = 256; - } else if (input0_desc.dims.d[1] % 128 == 0) { - num_threads = 128; - } else if (input0_desc.dims.d[1] % 64 == 0) { - num_threads = 64; - } else if (input0_desc.dims.d[1] % 32 == 0) { - num_threads = 32; - } else if (input0_desc.dims.d[1] % 16 == 0) { - num_threads = 16; - } else if (input0_desc.dims.d[1] % 8 == 0) { - num_threads = 8; - } else if (input0_desc.dims.d[1] % 4 == 0) { - num_threads = 4; - } else if (input0_desc.dims.d[1] % 2 == 0) { - num_threads = 2; + if (vector_length < 1024) { + num_threads = vector_length; } else { - num_threads = 1; + if (vector_length % 512 == 0) { + num_threads = 512; + } else if (vector_length % 256 == 0) { + num_threads = 256; + } else if (vector_length % 128 == 0) { + num_threads = 128; + } else if (vector_length % 64 == 0) { + num_threads = 64; + } else if (vector_length % 32 == 0) { + num_threads = 32; + } else if (vector_length % 16 == 0) { + num_threads = 16; + } else if (vector_length % 8 == 0) { + num_threads = 8; + } else if (vector_length % 4 == 0) { + num_threads = 4; + } else if (vector_length % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } } const dim3 num_blocks( input1_desc.dims.d[0] - 1, input2_desc.dims.d[1], - input0_desc.dims.d[1] / num_threads); // batchs, max sequnce length - // (mask_id.dims.d[1]), - // input.dims.d[1]/256 + vector_length / num_threads); // batchs, max sequnce length + // (mask_id.dims.d[1]), + // input.dims.d[1]/*** RecoverPaddingKernel<<>>( input0, input1, output); return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu index 9f1a1d6d2c109..a18c0d0c7294b 
100644 --- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu @@ -19,9 +19,9 @@ namespace inference { namespace tensorrt { namespace plugin { -__global__ void RemovePaddingKernel(const float* input0, +__global__ void RemovePaddingKernel(const half* input0, const int32_t* input1, - float* output) { + half* output) { int word_id = blockIdx.x * gridDim.y + blockIdx.y; int32_t seqence_length = input1[blockIdx.x + 1] - input1[blockIdx.x]; if (blockIdx.y < seqence_length) { @@ -73,7 +73,7 @@ bool RemovePaddingPlugin::supportsFormatCombination( return inOut[pos].type == nvinfer1::DataType::kINT32 && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } else { - return inOut[pos].type == nvinfer1::DataType::kFLOAT && + return inOut[pos].type == nvinfer1::DataType::kHALF && inOut[pos].format == nvinfer1::TensorFormat::kLINEAR; } // return (inOut[pos].type == nvinfer1::DataType::kFLOAT && inOut[pos].format @@ -106,38 +106,43 @@ int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { const auto input_desc = inputDesc[0]; - const float* input0 = static_cast(inputs[0]); + const half* input0 = static_cast(inputs[0]); const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor - float* output = static_cast(outputs[0]); + half* output = static_cast(outputs[0]); const auto input0_desc = inputDesc[0]; + const int32_t vector_length = input0_desc.dims.d[2]; int32_t num_threads; - if (input0_desc.dims.d[2] % 512 == 0) { - num_threads = 512; - } else if (input0_desc.dims.d[2] % 256 == 0) { - num_threads = 256; - } else if (input0_desc.dims.d[2] % 128 == 0) { - num_threads = 128; - } else if (input0_desc.dims.d[2] % 64 == 0) { - num_threads = 64; - } else if (input0_desc.dims.d[2] % 32 == 0) { - num_threads = 32; - } else if (input0_desc.dims.d[2] % 16 == 0) { - num_threads = 16; - } else if (input0_desc.dims.d[2] % 8 == 0) { - num_threads = 8; - } else if (input0_desc.dims.d[2] % 4 == 0) { - num_threads = 4; - } else if (input0_desc.dims.d[2] % 2 == 0) { - num_threads = 2; + if (vector_length < 1024) { + num_threads = vector_length; } else { - num_threads = 1; + if (vector_length % 512 == 0) { + num_threads = 512; + } else if (vector_length % 256 == 0) { + num_threads = 256; + } else if (vector_length % 128 == 0) { + num_threads = 128; + } else if (vector_length % 64 == 0) { + num_threads = 64; + } else if (vector_length % 32 == 0) { + num_threads = 32; + } else if (vector_length % 16 == 0) { + num_threads = 16; + } else if (vector_length % 8 == 0) { + num_threads = 8; + } else if (vector_length % 4 == 0) { + num_threads = 4; + } else if (vector_length % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } } const dim3 num_blocks( input0_desc.dims.d[0], input0_desc.dims.d[1], - input0_desc.dims.d[2] / - num_threads); // batchs, max sequnce length, input.dims.d[2]/256 + vector_length / + num_threads); // batchs, max sequnce length, input0.dims.d[2]/*** RemovePaddingKernel<<>>( input0, input1, output); diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc index 4cc20c4365975..543e7dca22c31 100644 --- a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc @@ -26,12 +26,9 @@ TEST(fused_token_prune_op_plugin, test_plugin) { /*keep_first_token*/ 
false, /*keep_order*/ true, /*flag_varseqlen*/ false); - plugin.configurePlugin(nullptr, 4, nullptr, 2); plugin.initialize(); plugin.getPluginType(); plugin.getNbOutputs(); - auto clone_plugin = plugin.clone(); - clone_plugin->destroy(); size_t buf_size = plugin.getSerializationSize(); std::vector buf(buf_size); plugin.serialize(buf.data()); From e29c50c2d78e026b77512d3a4ecc3224886d7297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Thu, 24 Nov 2022 15:24:24 +0800 Subject: [PATCH 196/210] remove pad2d in nn.py (#47854) --- python/paddle/fluid/layers/nn.py | 146 ------------------ .../dygraph_to_static/test_cycle_gan.py | 17 +- .../fluid/tests/unittests/test_layers.py | 16 +- .../fluid/tests/unittests/test_pad2d_op.py | 20 +-- 4 files changed, 16 insertions(+), 183 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 95126e9f239d1..49d6906d6a4d3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -116,7 +116,6 @@ 'crop_tensor', 'prelu', 'flatten', - 'pad2d', 'unique', 'unique_with_counts', 'scale', @@ -7579,151 +7578,6 @@ def _attr_offsets_check(offset_val): return out -def pad2d( - input, - paddings=[0, 0, 0, 0], - mode='constant', - pad_value=0.0, - data_format="NCHW", - name=None, -): - """ - - Pad 2-d images according to 'paddings' and 'mode'. - If mode is 'reflect', paddings[0] and paddings[1] must be no greater - than height-1. And the width dimension has the same condition. - - Parameters: - input (Tensor): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32. - paddings (Tensor | List[int32]): The padding size. If padding is a List, it must - contain four integers, (padding_top, padding_bottom, padding_left, padding_right). - Otherwise, it is a 1-D Tensor with shape [4]. Data type is int32. - Default is [0, 0, 0, 0]. - mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . - When in 'constant' mode, this op uses a constant value to pad the input tensor. - When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. - When in 'edge' mode, uses input boundaries to pad the input tensor. - Default is 'constant' - pad_value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 - data_format (str): An string from: "NHWC", "NCHW". Specify the data format of - the input data. - Default is "NCHW" - name (str, optional) : The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` . - - Returns: - Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. - - Examples: - .. code-block:: text - - Input = [[[[1., 2., 3.], - [4., 5., 6.]]]] - - Case 0: - paddings = [0, 1, 2, 3], - mode = 'constant' - pad_value = 0 - Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.], - [0., 0., 4., 5., 6., 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0., 0.]]]] - - Case 1: - paddings = [0, 1, 2, 1], - mode = 'reflect' - Out = [[[[3., 2., 1., 2., 3., 2.], - [6., 5., 4., 5., 6., 5.], - [3., 2., 1., 2., 3., 2.]]]] - - Case 2: - paddings = [0, 1, 2, 1], - mode = 'edge' - Out = [[[[1., 1., 1., 2., 3., 3.], - [4., 4., 4., 5., 6., 6.], - [4., 4., 4., 5., 6., 6.]]]] - - Code Examples: - .. 
code-block:: python - - import numpy as np - import paddle - import paddle.nn.functional as F - - # example 1 - x_shape = (1, 1, 3, 4) - x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 - tensor_x = paddle.to_tensor(x) - y = paddle.fluid.layers.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant') - print(y.numpy()) - # [[[[ 1. 1. 1. 1. 1. 1. 1.] - # [ 1. 1. 1. 2. 3. 4. 1.] - # [ 1. 1. 5. 6. 7. 8. 1.] - # [ 1. 1. 9. 10. 11. 12. 1.] - # [ 1. 1. 1. 1. 1. 1. 1.] - # [ 1. 1. 1. 1. 1. 1. 1.]]]] - - # example 2 - x_shape = (1, 1, 2, 3) - x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 - tensor_x = paddle.to_tensor(x) - y = paddle.fluid.layers.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect') - print(y.numpy()) - # [[[[5. 4. 5. 6. 5.] - # [2. 1. 2. 3. 2.] - # [5. 4. 5. 6. 5.] - # [2. 1. 2. 3. 2.]]]] - """ - if _non_static_mode(): - _paddings = ( - paddings.numpy().tolist() - if isinstance(paddings, Variable) - else paddings - ) - return _legacy_C_ops.pad2d( - input, - 'mode', - mode, - 'pad_value', - pad_value, - 'data_format', - data_format, - 'paddings', - _paddings, - ) - - check_variable_and_dtype( - input, - 'input', - ['float16', 'float32', 'float64', 'int32', 'int64'], - "pad2d", - ) - - attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format} - inputs = {'X': [input]} - if isinstance(paddings, Variable): - inputs['Paddings'] = [paddings] - attrs['paddings'] = [] - else: - attrs['paddings'] = paddings - - helper = LayerHelper('pad2d', **locals()) - - assert mode in [ - 'reflect', - 'edge', - 'constant', - ], "mode should be one of constant, reflect, edge." - - dtype = helper.input_dtype(input_param_name='input') - out = helper.create_variable_for_type_inference(dtype) - - helper.append_op( - type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs - ) - - return out - - @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py index a41791eb04ef4..6f4ea90030d54 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py @@ -179,10 +179,12 @@ def __init__(self, dim, use_bias=False): self.dim = dim def forward(self, inputs): - out_res = fluid.layers.pad2d(inputs, [1, 1, 1, 1], mode="reflect") + pad1 = paddle.nn.Pad2D([1, 1, 1, 1], mode="reflect") + out_res = pad1(inputs) out_res = self.conv0(out_res) - out_res = fluid.layers.pad2d(out_res, [1, 1, 1, 1], mode="reflect") + pad2 = paddle.nn.Pad2D([1, 1, 1, 1], mode="reflect") + out_res = pad2(out_res) out_res = self.conv1(out_res) return out_res + inputs @@ -253,7 +255,8 @@ def __init__(self, input_channel): ) def forward(self, inputs): - pad_input = fluid.layers.pad2d(inputs, [3, 3, 3, 3], mode="reflect") + pad1 = paddle.nn.Pad2D([3, 3, 3, 3], mode="reflect") + pad_input = pad1(inputs) y = self.conv0(pad_input) y = self.conv1(y) y = self.conv2(y) @@ -261,7 +264,8 @@ def forward(self, inputs): y = build_resnet_block_i(y) y = self.deconv0(y) y = self.deconv1(y) - y = fluid.layers.pad2d(y, [3, 3, 3, 3], mode="reflect") + pad2 = paddle.nn.Pad2D([3, 3, 3, 3], mode="reflect") + y = pad2(y) y = self.conv3(y) y = paddle.tanh(y) return y @@ -461,9 +465,10 @@ def __init__( def forward(self, inputs): conv = self._deconv(inputs) - 
conv = fluid.layers.pad2d( - conv, paddings=self.outpadding, mode='constant', pad_value=0.0 + tmp_pad = paddle.nn.Pad2D( + padding=self.outpadding, mode='constant', value=0.0 ) + conv = tmp_pad(conv) if self.norm: conv = self.bn(conv) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f191a948c9d2c..ed230ea98b00f 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3502,23 +3502,15 @@ def make_pad2d(self): input = self._get_data( name="input", shape=[3, 100, 100], dtype="float32" ) - paddings = layers.fill_constant(shape=[4], dtype='int32', value=1) - out = layers.pad2d( - input, - paddings=[1, 2, 3, 4], - mode='reflect', - data_format='NCHW', - name="shape", - ) - out_1 = layers.pad2d( - input, - paddings=paddings, + + tmp_pad = paddle.nn.Pad2D( + padding=[1, 2, 3, 4], mode='reflect', data_format='NCHW', name="shape", ) + out = tmp_pad(input) return out - return out_1 def make_prelu(self): with program_guard( diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py index aeca9becf84b1..320d615f63b9f 100644 --- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard +import unittest class TestPad2dOp(OpTest): @@ -138,21 +136,5 @@ def initTestCase(self): self.variable_paddings = True -class TestPad2dOpError(unittest.TestCase): - def test_errors(self): - with program_guard(Program(), Program()): - input_data = np.random.random((2, 2, 2, 2)).astype("float32") - - def test_Variable(): - fluid.layers.pad2d(input=input_data, paddings=[1, 1, 1, 1]) - - self.assertRaises(TypeError, test_Variable) - - data = fluid.data( - name='data', shape=[None, 3, 20, 20], dtype='float16' - ) - fluid.layers.pad2d(input=data, paddings=[1, 1, 1, 1]) - - if __name__ == '__main__': unittest.main() From df23c7c3d3b716c108f5a2d03fa867d514155711 Mon Sep 17 00:00:00 2001 From: PuQing Date: Thu, 24 Nov 2022 15:32:48 +0800 Subject: [PATCH 197/210] [PHI decoupling] remove "paddle/fluid/platform/enforce.h" in phi (#48049) --- paddle/fluid/inference/check_symbol.sh | 2 +- paddle/fluid/platform/CMakeLists.txt | 3 - paddle/fluid/platform/enforce.h | 530 --------------- paddle/phi/backends/callback_manager.cc | 2 +- paddle/phi/backends/dynload/cudnn.cc | 2 +- paddle/phi/backends/dynload/cufft.cc | 2 +- paddle/phi/backends/dynload/dynamic_loader.cc | 2 +- paddle/phi/backends/dynload/miopen.cc | 2 +- paddle/phi/backends/dynload/tensorrt.h | 2 +- paddle/phi/backends/gpu/cuda/cuda_info.cc | 3 +- paddle/phi/backends/gpu/gpu_context.cc | 4 +- paddle/phi/backends/gpu/gpu_resources.cc | 3 +- paddle/phi/backends/gpu/rocm/rocm_info.cc | 5 +- paddle/phi/backends/xpu/enforce_xpu.h | 2 +- paddle/phi/backends/xpu/xpu_header.h | 2 +- paddle/phi/core/CMakeLists.txt | 4 + paddle/phi/core/cuda_stream.h | 3 +- paddle/phi/core/enforce.h | 601 +++++++++++++++++- .../core}/external_error.proto | 2 +- paddle/phi/kernels/autotune/CMakeLists.txt | 7 +- paddle/phi/kernels/funcs/concat_funcs.h | 2 +- paddle/phi/kernels/funcs/cpu_vec.h | 2 +- paddle/phi/kernels/funcs/cufft_util.h | 2 +- paddle/phi/kernels/funcs/gru_compute.h | 2 +- 
paddle/phi/kernels/funcs/hipfft_util.h | 2 +- paddle/phi/kernels/funcs/lstm_compute.h | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 2 +- .../kernels/gpu/class_center_sample_kernel.cu | 2 +- tools/externalError/start.sh | 2 +- 32 files changed, 633 insertions(+), 574 deletions(-) rename paddle/{fluid/platform => phi/core}/external_error.proto (97%) diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index 1d9b566e6c433..0d8892f20514f 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -18,7 +18,7 @@ lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi num_paddle_syms=$(nm -D "${lib}" | grep -c paddle ) -num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -v brpc | grep -c "T " ) +num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -v phi | grep -v brpc | grep -c "T " ) if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 627ea8b9aa7ec..63099e71afff5 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,8 +1,5 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) -if(WITH_GPU) - proto_library(external_error_proto SRCS external_error.proto) -endif() if(WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 5bd9029179ffe..4be05f24bc70a 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -36,8 +36,6 @@ limitations under the License. 
*/ #include #include #include - -#include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP @@ -224,533 +222,5 @@ struct EOFException : public std::exception { END_HANDLE_THE_ERROR \ } while (0) -/**************************************************************************/ -/**************************** NVIDIA ERROR ********************************/ -#ifdef PADDLE_WITH_CUDA - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - static constexpr const char* kTypeString = #proto_type; \ - static constexpr platform::proto::ApiType kProtoType = \ - platform::proto::ApiType::proto_type; \ - } - -DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); -DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); -DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); -DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); -DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS, CUSPARSE); -DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); -DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); -DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); -#endif - -} // namespace details - -template -inline const char* GetErrorMsgUrl(T status) { - using __CUDA_STATUS_TYPE__ = decltype(status); - platform::proto::ApiType proto_type = - details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; - switch (proto_type) { - case platform::proto::ApiType::CUDA: - case platform::proto::ApiType::CU: - return "https://docs.nvidia.com/cuda/cuda-runtime-api/" - "group__CUDART__TYPES.html#group__CUDART__TYPES_" - "1g3f51e3575c2178246db0a94a430e0038"; - break; - case platform::proto::ApiType::CURAND: - return "https://docs.nvidia.com/cuda/curand/" - "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; - break; - case platform::proto::ApiType::CUDNN: - return "https://docs.nvidia.com/deeplearning/cudnn/api/" - "index.html#cudnnStatus_t"; - break; - case platform::proto::ApiType::CUBLAS: - return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; - break; - case platform::proto::ApiType::CUSOLVER: - return "https://docs.nvidia.com/cuda/cusolver/" - "index.html#cuSolverSPstatus"; - break; - case platform::proto::ApiType::NCCL: - return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" - "types.html#ncclresult-t"; - break; - case platform::proto::ApiType::CUFFT: - return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; - case platform::proto::ApiType::CUSPARSE: - return "https://docs.nvidia.com/cuda/cusparse/" - "index.html#cusparseStatus_t"; - break; - default: - return "Unknown type of External API, can't get error message URL!"; - break; - } -} - -template -inline std::string GetExternalErrorMsg(T status) { - std::ostringstream sout; - bool _initSucceed = false; - platform::proto::ExternalErrorDesc externalError; - if (externalError.ByteSizeLong() == 0) { - std::string filePath; -#if !defined(_WIN32) - Dl_info info; - if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { - std::string strModule(info.dli_fname); - const size_t last_slash_idx = strModule.find_last_of("/"); - std::string compare_path = 
strModule.substr(strModule.length() - 6); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.so") == 0) { - filePath = - strModule + - "/../include/third_party/externalError/data/externalErrorMsg.pb"; - } else { - filePath = strModule + - "/../../third_party/externalError/data/externalErrorMsg.pb"; - } - } -#else - char buf[512]; - MEMORY_BASIC_INFORMATION mbi; - HMODULE h_module = - (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) - ? (HMODULE)mbi.AllocationBase - : NULL; - GetModuleFileName(h_module, buf, 512); - std::string strModule(buf); - const size_t last_slash_idx = strModule.find_last_of("\\"); - std::string compare_path = strModule.substr(strModule.length() - 7); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.pyd") == 0) { - filePath = strModule + - "\\..\\include\\third_" - "party\\externalerror\\data\\externalErrorMsg.pb"; - } else { - filePath = - strModule + - "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; - } -#endif - std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = externalError.ParseFromIstream(&fin); - } - using __CUDA_STATUS_TYPE__ = decltype(status); - platform::proto::ApiType proto_type = - details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; - if (_initSucceed) { - for (int i = 0; i < externalError.errors_size(); ++i) { - if (proto_type == externalError.errors(i).type()) { - for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { - if (status == externalError.errors(i).messages(j).code()) { - sout << "\n [Hint: " - << externalError.errors(i).messages(j).message() << "]"; - return sout.str(); - } - } - } - } - } - - sout << "\n [Hint: Please search for the error code(" << status - << ") on website (" << GetErrorMsgUrl(status) - << ") to get Nvidia's official solution and advice about " - << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString - << " Error.]"; - return sout.str(); -} - -template std::string GetExternalErrorMsg(cudaError_t); -template std::string GetExternalErrorMsg(curandStatus_t); -template std::string GetExternalErrorMsg(cudnnStatus_t); -template std::string GetExternalErrorMsg(cublasStatus_t); -template std::string GetExternalErrorMsg(cusparseStatus_t); -template std::string GetExternalErrorMsg(cusolverStatus_t); -template std::string GetExternalErrorMsg(cufftResult_t); -template std::string GetExternalErrorMsg(CUresult); -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -template std::string GetExternalErrorMsg(ncclResult_t); -#endif - -/*************** CUDA ERROR ***************/ -inline bool is_error(cudaError_t e) { return e != cudaSuccess; } - -inline std::string build_nvidia_error_msg(cudaError_t e) { - std::ostringstream sout; - sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " - << GetExternalErrorMsg(e); - return sout.str(); -} - -/*************** CURAND ERROR ***************/ -inline bool is_error(curandStatus_t stat) { - return stat != CURAND_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::ostringstream sout; - sout << "CURAND error(" << stat << "). 
" << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUDNN ERROR ***************/ -inline bool is_error(cudnnStatus_t stat) { - return stat != CUDNN_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - std::ostringstream sout; - sout << "CUDNN error(" << stat << "), " - << phi::dynload::cudnnGetErrorString(stat) << ". " - << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUBLAS ERROR ***************/ -inline bool is_error(cublasStatus_t stat) { - return stat != CUBLAS_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::ostringstream sout; - sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUSPARSE ERROR ***************/ -inline bool is_error(cusparseStatus_t stat) { - return stat != CUSPARSE_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cusparseStatus_t stat) { - std::ostringstream sout; - sout << "CUSparse error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUSOLVER ERROR ***************/ -inline bool is_error(cusolverStatus_t stat) { - return stat != CUSOLVER_STATUS_SUCCESS; -} - -inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { - std::ostringstream sout; - sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUFFT ERROR ***************/ -inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } - -inline std::string build_nvidia_error_msg(cufftResult_t stat) { - std::ostringstream sout; - sout << "CUFFT error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/*************** CUresult ERROR ***************/ -inline bool is_error(CUresult stat) { return stat != CUDA_SUCCESS; } - -inline std::string build_nvidia_error_msg(CUresult stat) { - std::ostringstream sout; - sout << "CU error(" << stat << "). " << GetExternalErrorMsg(stat); - return sout.str(); -} - -/**************** NCCL ERROR ****************/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; -} - -inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::ostringstream sout; - sout << "NCCL error(" << nccl_result << "), " - << phi::dynload::ncclGetErrorString(nccl_result) << ". "; - if (errno == ENOSPC || errno == EAGAIN) { - std::string detail(strerror(errno)); - detail += "\nPlease try one of the following solutions:"; - detail += "\n1. export NCCL_SHM_DISABLE=1;"; - detail += "\n2. export NCCL_P2P_LEVEL=SYS;"; - detail += - "\n3. 
Increase shared memory by setting the -shm-size " - "option when starting docker container, e.g., setting " - " -shm-size=2g.\n"; - sout << " Detail: " + detail; - } - sout << GetExternalErrorMsg(nccl_result); - return sout.str(); -} -#endif // not(__APPLE__) and PADDLE_WITH_NCCL - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::paddle::platform::build_nvidia_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ - do { \ - auto res = cudaGetLastError(); \ - if (UNLIKELY(res != cudaSuccess)) { \ - auto msg = ::paddle::platform::build_nvidia_error_msg(res); \ - PADDLE_THROW(platform::errors::Fatal( \ - "CUDA error after kernel (%s): %s", OP, msg)); \ - } \ - } while (0) - -inline void retry_sleep(unsigned milliseconds) { -#ifdef _WIN32 - Sleep(milliseconds); -#else - if (milliseconds < 1000) { - // usleep argument must be less than 1,000,000. Reference: - // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html - usleep(milliseconds * 1000); - } else { - // clip to sleep in seconds because we can not and don't have to - // sleep for exact milliseconds - sleep(milliseconds / 1000); - } -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - paddle::platform::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::paddle::platform::build_nvidia_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_CUDA - -/**************************************************************************/ -/***************************** HIP ERROR **********************************/ -#ifdef PADDLE_WITH_HIP - -/***** HIP ERROR *****/ -inline bool is_error(hipError_t e) { return e != hipSuccess; } - -inline std::string build_rocm_error_msg(hipError_t e) { - std::ostringstream sout; - sout << " Hip error(" << e << "), " << hipGetErrorString(e) << "."; - return sout.str(); -} - -/***** HIPRAND ERROR *****/ -inline bool is_error(hiprandStatus_t stat) { - return stat != HIPRAND_STATUS_SUCCESS; -} - -inline const char* hiprandGetErrorString(hiprandStatus_t stat) { - switch (stat) { - case HIPRAND_STATUS_SUCCESS: - return "HIPRAND_STATUS_SUCCESS"; - case HIPRAND_STATUS_VERSION_MISMATCH: - return "HIPRAND_STATUS_VERSION_MISMATCH"; - case HIPRAND_STATUS_NOT_INITIALIZED: - return "HIPRAND_STATUS_NOT_INITIALIZED"; - case HIPRAND_STATUS_ALLOCATION_FAILED: - return "HIPRAND_STATUS_ALLOCATION_FAILED"; - case HIPRAND_STATUS_TYPE_ERROR: - return "HIPRAND_STATUS_TYPE_ERROR"; - case HIPRAND_STATUS_OUT_OF_RANGE: - return "HIPRAND_STATUS_OUT_OF_RANGE"; - case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE: - return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE"; - case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return 
"HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case HIPRAND_STATUS_LAUNCH_FAILURE: - return "HIPRAND_STATUS_LAUNCH_FAILURE"; - case HIPRAND_STATUS_PREEXISTING_FAILURE: - return "HIPRAND_STATUS_PREEXISTING_FAILURE"; - case HIPRAND_STATUS_INITIALIZATION_FAILED: - return "HIPRAND_STATUS_INITIALIZATION_FAILED"; - case HIPRAND_STATUS_ARCH_MISMATCH: - return "HIPRAND_STATUS_ARCH_MISMATCH"; - case HIPRAND_STATUS_INTERNAL_ERROR: - return "HIPRAND_STATUS_INTERNAL_ERROR"; - case HIPRAND_STATUS_NOT_IMPLEMENTED: - return "HIPRAND_STATUS_NOT_IMPLEMENTED"; - default: - return "Unknown hiprand status"; - } -} - -inline std::string build_rocm_error_msg(hiprandStatus_t stat) { - std::string msg(" Hiprand error, "); - return msg + hiprandGetErrorString(stat) + " "; -} - -/***** MIOPEN ERROR *****/ -inline bool is_error(miopenStatus_t stat) { - return stat != miopenStatusSuccess; -} - -inline std::string build_rocm_error_msg(miopenStatus_t stat) { - std::string msg(" Miopen error, "); - return msg + phi::dynload::miopenGetErrorString(stat) + " "; -} - -/***** ROCBLAS ERROR *****/ -inline bool is_error(rocblas_status stat) { - return stat != rocblas_status_success; -} - -inline const char* rocblasGetErrorString(rocblas_status stat) { - switch (stat) { - case rocblas_status_invalid_handle: - return "rocblas_status_invalid_handle"; - case rocblas_status_memory_error: - return "rocblas_status_memory_error"; - case rocblas_status_invalid_value: - return "rocblas_status_invalid_value"; - case rocblas_status_not_implemented: - return "rocblas_status_not_implemented"; - case rocblas_status_invalid_pointer: - return "rocblas_status_invalid_pointer"; - case rocblas_status_invalid_size: - return "rocblas_status_invalid_size"; - case rocblas_status_internal_error: - return "rocblas_status_internal_error"; - default: - return "Unknown cublas status"; - } -} - -inline std::string build_rocm_error_msg(rocblas_status stat) { - std::string msg(" Rocblas error, "); - return msg + rocblasGetErrorString(stat) + " "; -} - -/****** RCCL ERROR ******/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != ncclSuccess; -} - -inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { - std::string msg(" Rccl error, "); - return msg + phi::dynload::ncclGetErrorString(nccl_result) + " "; -} -#endif // not(__APPLE__) and PADDLE_WITH_NCCL - -/***** HIPFFT ERROR *****/ -inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } - -inline std::string build_rocm_error_msg(hipfftResult_t stat) { - std::string msg(" HIPFFT error, "); - return msg + phi::dynload::hipfftGetErrorString(stat) + " "; -} - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); -DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); -DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); -DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); -#endif - -} // namespace details - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto 
__success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::paddle::platform::build_rocm_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -inline void retry_sleep(unsigned millisecond) { -#ifdef _WIN32 - Sleep(millisecond); -#else - sleep(millisecond); -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::paddle::platform::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::paddle::platform::build_rocm_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_HIP - } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index 7ce59880383c7..c996b8d9befa9 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -17,8 +17,8 @@ #include #include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 9bd38a89ab177..924dd60d2c5e1 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cudnn.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 5a7080032d28d..a15969ecc3f87 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cufft.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index c9ae1d0cd76fe..3f22e24429332 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/cupti_lib_path.h" +#include "paddle/phi/core/enforce.h" #if defined(_WIN32) #include diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index 9c58da1d6ff1a..b8f328b4aae34 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/miopen.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/dynload/tensorrt.h b/paddle/phi/backends/dynload/tensorrt.h index cd8c6457f1b91..c971a66031828 100644 --- a/paddle/phi/backends/dynload/tensorrt.h +++ b/paddle/phi/backends/dynload/tensorrt.h @@ -21,8 +21,8 @@ limitations under the License. */ #include // NOLINT -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/core/enforce.h" namespace phi { namespace dynload { diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index 4b5de3db54d19..f6bff1c7b3c35 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -14,8 +14,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(phi): remove fluid headers. -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 0e102911442f2..c8f5463d82588 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include +#include #include "glog/logging.h" #include "paddle/phi/api/ext/exception.h" @@ -54,8 +55,7 @@ limitations under the License. */ // without eigen. #include "unsupported/Eigen/CXX11/Tensor" -// TODO(phi): remove fluid header. -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 4a16480101a70..e05e75636c1a0 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -32,8 +32,7 @@ #include "unsupported/Eigen/CXX11/Tensor" -// TODO(phi): remove fluid header. -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index 1646d9666ff42..8e4e06af801ae 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -16,12 +16,11 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(phi): remove fluid headers. -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" static std::once_flag g_device_props_size_init_flag; static std::vector> g_device_props_init_flags; -static std::vector g_device_props; +static std::vector g_device_props; namespace phi { namespace backends { diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index 44763d408f7d7..71bea73634bf6 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/xpu/xpu_header.h" +#include "paddle/phi/core/enforce.h" #ifdef PADDLE_WITH_XPU_BKCL #include "xpu/bkcl.h" #endif diff --git a/paddle/phi/backends/xpu/xpu_header.h b/paddle/phi/backends/xpu/xpu_header.h index 1fe6f6d07796f..ca4ea6145caf0 100644 --- a/paddle/phi/backends/xpu/xpu_header.h +++ b/paddle/phi/backends/xpu/xpu_header.h @@ -19,9 +19,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" #include "xpu/xdnn.h" diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 90f5d38bfc93b..3ecd022ff5d23 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -1,6 +1,10 @@ # compatible utils used for fluid op system add_subdirectory(compat) +if(WITH_GPU) + proto_library(external_error_proto SRCS external_error.proto) +endif() + cc_library(errors SRCS errors.cc) set(phi_enforce_deps errors flags) if(WITH_GPU) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 61aa9648dbabf..160a31262b3a1 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,8 +28,7 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif -// TODO(phi): remove fluid headers. -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index cfe8b47ef9afa..d8449d5ca45d2 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -28,21 +25,81 @@ limitations under the License. 
*/ #include // GetModuleFileName, Sleep #endif +#ifdef PADDLE_WITH_CUDA +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/core/external_error.pb.h" +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include +#include +#include +#include // NOLINT +#endif + +#include +#include +#include #include #include #include #include +#include #if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) #include #endif -#include "gflags/gflags.h" - #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/core/errors.h" + +#include "paddle/phi/backends/dynload/port.h" #include "paddle/utils/string/printf.h" #include "paddle/utils/string/to_string.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/backends/dynload/cublas.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/curand.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +#include + +#include "paddle/phi/backends/dynload/nccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/backends/dynload/hipfft.h" +#include "paddle/phi/backends/dynload/hiprand.h" +#include "paddle/phi/backends/dynload/miopen.h" +#include "paddle/phi/backends/dynload/rocblas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#include // NOLINT + +#include "paddle/phi/backends/dynload/rccl.h" +#endif // __APPLE__ +#endif // PADDLE_WITH_HIP + +// Note: these headers for simplify demangle type string +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/type_defs.h" +// Note: this header for simplify HIP and CUDA type string +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_types.h" +#endif +#include "paddle/fluid/platform/flags.h" + #include "paddle/utils/variant.h" DECLARE_int32(call_stack_level); @@ -51,6 +108,10 @@ namespace phi { class ErrorSummary; } // namespace phi +namespace phi { +namespace proto {} // namespace proto +} // namespace phi + namespace phi { namespace enforce { @@ -420,7 +481,7 @@ struct EnforceNotMet : public std::exception { * the direct use of paddle::get by CI rule. 
* * Parameters: - *     __TYPE: the target variable type + * __TYPE: the target variable type * __VALUE: the target variable to get * * Examples: @@ -480,6 +541,534 @@ DEFINE_SAFE_PADDLE_GET(InputType&&, phi::enforce::details::SafeBoostGetMutable<__TYPE>( \ __VALUE, #__VALUE, __FILE__, __LINE__) +/**************************************************************************/ +/**************************** NVIDIA ERROR ********************************/ +#ifdef PADDLE_WITH_CUDA + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + static constexpr const char* kTypeString = #proto_type; \ + static constexpr phi::proto::ApiType kProtoType = \ + phi::proto::ApiType::proto_type; \ + } + +DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); +DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); +DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); +DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); +DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS, CUSPARSE); +DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); +DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); +DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); +#endif + +} // namespace details + +template +inline const char* GetErrorMsgUrl(T status) { + using __CUDA_STATUS_TYPE__ = decltype(status); + phi::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; + switch (proto_type) { + case phi::proto::ApiType::CUDA: + case phi::proto::ApiType::CU: + return "https://docs.nvidia.com/cuda/cuda-runtime-api/" + "group__CUDART__TYPES.html#group__CUDART__TYPES_" + "1g3f51e3575c2178246db0a94a430e0038"; + break; + case phi::proto::ApiType::CURAND: + return "https://docs.nvidia.com/cuda/curand/" + "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; + break; + case phi::proto::ApiType::CUDNN: + return "https://docs.nvidia.com/deeplearning/cudnn/api/" + "index.html#cudnnStatus_t"; + break; + case phi::proto::ApiType::CUBLAS: + return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; + break; + case phi::proto::ApiType::CUSOLVER: + return "https://docs.nvidia.com/cuda/cusolver/" + "index.html#cuSolverSPstatus"; + break; + case phi::proto::ApiType::NCCL: + return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" + "types.html#ncclresult-t"; + break; + case phi::proto::ApiType::CUFFT: + return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; + case phi::proto::ApiType::CUSPARSE: + return "https://docs.nvidia.com/cuda/cusparse/" + "index.html#cusparseStatus_t"; + break; + default: + return "Unknown type of External API, can't get error message URL!"; + break; + } +} + +template +inline std::string GetExternalErrorMsg(T status) { + std::ostringstream sout; + bool _initSucceed = false; + phi::proto::ExternalErrorDesc externalError; + if (externalError.ByteSizeLong() == 0) { + std::string filePath; +#if !defined(_WIN32) + Dl_info info; + if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { + std::string strModule(info.dli_fname); + const size_t last_slash_idx = strModule.find_last_of("/"); + std::string compare_path = 
strModule.substr(strModule.length() - 6); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.so") == 0) { + filePath = + strModule + + "/../include/third_party/externalError/data/externalErrorMsg.pb"; + } else { + filePath = strModule + + "/../../third_party/externalError/data/externalErrorMsg.pb"; + } + } +#else + char buf[512]; + MEMORY_BASIC_INFORMATION mbi; + HMODULE h_module = + (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) + ? (HMODULE)mbi.AllocationBase + : NULL; + GetModuleFileName(h_module, buf, 512); + std::string strModule(buf); + const size_t last_slash_idx = strModule.find_last_of("\\"); + std::string compare_path = strModule.substr(strModule.length() - 7); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.pyd") == 0) { + filePath = strModule + + "\\..\\include\\third_" + "party\\externalerror\\data\\externalErrorMsg.pb"; + } else { + filePath = + strModule + + "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; + } +#endif + std::ifstream fin(filePath, std::ios::in | std::ios::binary); + _initSucceed = externalError.ParseFromIstream(&fin); + } + using __CUDA_STATUS_TYPE__ = decltype(status); + phi::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; + if (_initSucceed) { + for (int i = 0; i < externalError.errors_size(); ++i) { + if (proto_type == externalError.errors(i).type()) { + for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { + if (status == externalError.errors(i).messages(j).code()) { + sout << "\n [Hint: " + << externalError.errors(i).messages(j).message() << "]"; + return sout.str(); + } + } + } + } + } + + sout << "\n [Hint: Please search for the error code(" << status + << ") on website (" << GetErrorMsgUrl(status) + << ") to get Nvidia's official solution and advice about " + << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString + << " Error.]"; + return sout.str(); +} + +template std::string GetExternalErrorMsg(cudaError_t); +template std::string GetExternalErrorMsg(curandStatus_t); +template std::string GetExternalErrorMsg(cudnnStatus_t); +template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusparseStatus_t); +template std::string GetExternalErrorMsg(cusolverStatus_t); +template std::string GetExternalErrorMsg(cufftResult_t); +template std::string GetExternalErrorMsg(CUresult); +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +template std::string GetExternalErrorMsg(ncclResult_t); +#endif + +/*************** CUDA ERROR ***************/ +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } + +inline std::string build_nvidia_error_msg(cudaError_t e) { + std::ostringstream sout; + sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " + << GetExternalErrorMsg(e); + return sout.str(); +} + +/*************** CURAND ERROR ***************/ +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(curandStatus_t stat) { + std::ostringstream sout; + sout << "CURAND error(" << stat << "). 
" << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUDNN ERROR ***************/ +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { + std::ostringstream sout; + sout << "CUDNN error(" << stat << "), " + << phi::dynload::cudnnGetErrorString(stat) << ". " + << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUBLAS ERROR ***************/ +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(cublasStatus_t stat) { + std::ostringstream sout; + sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUSPARSE ERROR ***************/ +inline bool is_error(cusparseStatus_t stat) { + return stat != CUSPARSE_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(cusparseStatus_t stat) { + std::ostringstream sout; + sout << "CUSparse error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUSOLVER ERROR ***************/ +inline bool is_error(cusolverStatus_t stat) { + return stat != CUSOLVER_STATUS_SUCCESS; +} + +inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { + std::ostringstream sout; + sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUFFT ERROR ***************/ +inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } + +inline std::string build_nvidia_error_msg(cufftResult_t stat) { + std::ostringstream sout; + sout << "CUFFT error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + +/*************** CUresult ERROR ***************/ +inline bool is_error(CUresult stat) { return stat != CUDA_SUCCESS; } + +inline std::string build_nvidia_error_msg(CUresult stat) { + std::ostringstream sout; + sout << "CU error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + +/**************** NCCL ERROR ****************/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +inline bool is_error(ncclResult_t nccl_result) { + return nccl_result != ncclSuccess; +} + +inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { + std::ostringstream sout; + sout << "NCCL error(" << nccl_result << "), " + << phi::dynload::ncclGetErrorString(nccl_result) << ". "; + if (errno == ENOSPC || errno == EAGAIN) { + std::string detail(strerror(errno)); + detail += "\nPlease try one of the following solutions:"; + detail += "\n1. export NCCL_SHM_DISABLE=1;"; + detail += "\n2. export NCCL_P2P_LEVEL=SYS;"; + detail += + "\n3. 
Increase shared memory by setting the -shm-size " + "option when starting docker container, e.g., setting " + " -shm-size=2g.\n"; + sout << " Detail: " + detail; + } + sout << GetExternalErrorMsg(nccl_result); + return sout.str(); +} +#endif // not(__APPLE__) and PADDLE_WITH_NCCL + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_nvidia_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = cudaGetLastError(); \ + if (UNLIKELY(res != cudaSuccess)) { \ + auto msg = ::phi::enforce::build_nvidia_error_msg(res); \ + PADDLE_THROW( \ + phi::errors::Fatal("CUDA error after kernel (%s): %s", OP, msg)); \ + } \ + } while (0) + +inline void retry_sleep(unsigned milliseconds) { +#ifdef _WIN32 + Sleep(milliseconds); +#else + if (milliseconds < 1000) { + // usleep argument must be less than 1,000,000. Reference: + // https://pubs.opengroup.org/onlinepubs/7908799/xsh/usleep.html + usleep(milliseconds * 1000); + } else { + // clip to sleep in seconds because we can not and don't have to + // sleep for exact milliseconds + sleep(milliseconds / 1000); + } +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_nvidia_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_CUDA + +/**************************************************************************/ +/***************************** HIP ERROR **********************************/ +#ifdef PADDLE_WITH_HIP + +/***** HIP ERROR *****/ +inline bool is_error(hipError_t e) { return e != hipSuccess; } + +inline std::string build_rocm_error_msg(hipError_t e) { + std::ostringstream sout; + sout << " Hip error(" << e << "), " << hipGetErrorString(e) << "."; + return sout.str(); +} + +/***** HIPRAND ERROR *****/ +inline bool is_error(hiprandStatus_t stat) { + return stat != HIPRAND_STATUS_SUCCESS; +} + +inline const char* hiprandGetErrorString(hiprandStatus_t stat) { + switch (stat) { + case HIPRAND_STATUS_SUCCESS: + return "HIPRAND_STATUS_SUCCESS"; + case HIPRAND_STATUS_VERSION_MISMATCH: + return "HIPRAND_STATUS_VERSION_MISMATCH"; + case HIPRAND_STATUS_NOT_INITIALIZED: + return "HIPRAND_STATUS_NOT_INITIALIZED"; + case HIPRAND_STATUS_ALLOCATION_FAILED: + return "HIPRAND_STATUS_ALLOCATION_FAILED"; + case HIPRAND_STATUS_TYPE_ERROR: + return "HIPRAND_STATUS_TYPE_ERROR"; + case HIPRAND_STATUS_OUT_OF_RANGE: + return "HIPRAND_STATUS_OUT_OF_RANGE"; + case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE: + return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE"; + case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case 
HIPRAND_STATUS_LAUNCH_FAILURE: + return "HIPRAND_STATUS_LAUNCH_FAILURE"; + case HIPRAND_STATUS_PREEXISTING_FAILURE: + return "HIPRAND_STATUS_PREEXISTING_FAILURE"; + case HIPRAND_STATUS_INITIALIZATION_FAILED: + return "HIPRAND_STATUS_INITIALIZATION_FAILED"; + case HIPRAND_STATUS_ARCH_MISMATCH: + return "HIPRAND_STATUS_ARCH_MISMATCH"; + case HIPRAND_STATUS_INTERNAL_ERROR: + return "HIPRAND_STATUS_INTERNAL_ERROR"; + case HIPRAND_STATUS_NOT_IMPLEMENTED: + return "HIPRAND_STATUS_NOT_IMPLEMENTED"; + default: + return "Unknown hiprand status"; + } +} + +inline std::string build_rocm_error_msg(hiprandStatus_t stat) { + std::string msg(" Hiprand error, "); + return msg + hiprandGetErrorString(stat) + " "; +} + +/***** MIOPEN ERROR *****/ +inline bool is_error(miopenStatus_t stat) { + return stat != miopenStatusSuccess; +} + +inline std::string build_rocm_error_msg(miopenStatus_t stat) { + std::string msg(" Miopen error, "); + return msg + phi::dynload::miopenGetErrorString(stat) + " "; +} + +/***** ROCBLAS ERROR *****/ +inline bool is_error(rocblas_status stat) { + return stat != rocblas_status_success; +} + +inline const char* rocblasGetErrorString(rocblas_status stat) { + switch (stat) { + case rocblas_status_invalid_handle: + return "rocblas_status_invalid_handle"; + case rocblas_status_memory_error: + return "rocblas_status_memory_error"; + case rocblas_status_invalid_value: + return "rocblas_status_invalid_value"; + case rocblas_status_not_implemented: + return "rocblas_status_not_implemented"; + case rocblas_status_invalid_pointer: + return "rocblas_status_invalid_pointer"; + case rocblas_status_invalid_size: + return "rocblas_status_invalid_size"; + case rocblas_status_internal_error: + return "rocblas_status_internal_error"; + default: + return "Unknown cublas status"; + } +} + +inline std::string build_rocm_error_msg(rocblas_status stat) { + std::string msg(" Rocblas error, "); + return msg + rocblasGetErrorString(stat) + " "; +} + +/****** RCCL ERROR ******/ +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +inline bool is_error(ncclResult_t nccl_result) { + return nccl_result != ncclSuccess; +} + +inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { + std::string msg(" Rccl error, "); + return msg + phi::dynload::ncclGetErrorString(nccl_result) + " "; +} +#endif // not(__APPLE__) and PADDLE_WITH_NCCL + +/***** HIPFFT ERROR *****/ +inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } + +inline std::string build_rocm_error_msg(hipfftResult_t stat) { + std::string msg(" HIPFFT error, "); + return msg + phi::dynload::hipfftGetErrorString(stat) + " "; +} + +namespace details { + +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); +DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); +DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); +#endif + +} // namespace details + +#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + 
::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_rocm_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +inline void retry_sleep(unsigned millisecond) { +#ifdef _WIN32 + Sleep(millisecond); +#else + sleep(millisecond); +#endif +} + +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::phi::enforce::details::ExternalApiType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + ::phi::enforce::retry_sleep(10000); \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = phi::errors::External( \ + ::phi::enforce::build_rocm_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + +#undef DEFINE_EXTERNAL_API_TYPE +#endif // PADDLE_WITH_HIP + } // namespace enforce using namespace enforce; // NOLINT } // namespace phi diff --git a/paddle/fluid/platform/external_error.proto b/paddle/phi/core/external_error.proto similarity index 97% rename from paddle/fluid/platform/external_error.proto rename to paddle/phi/core/external_error.proto index 8861c2c2ff4fb..992207242963f 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/phi/core/external_error.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -package paddle.platform.proto; +package phi.proto; // (NOTE:zhouwei): ApiType describes which kind of external third party API // More external third party API can be added. diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index 5b195ef3ecf1e..aa05fcd74cc73 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -2,9 +2,12 @@ if(WITH_CUDNN_FRONTEND) cc_library( cache SRCS cache.cc - DEPS cudnn-frontend) + DEPS cudnn-frontend phi_enforce) else() - cc_library(cache SRCS cache.cc) + cc_library( + cache + SRCS cache.cc + DEPS phi_enforce) endif() cc_library( switch_autotune diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 1c1e0eb1d0a23..db965c2ef9b65 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/cpu_vec.h b/paddle/phi/kernels/funcs/cpu_vec.h index 21a0b429c992e..2719f86f522e5 100644 --- a/paddle/phi/kernels/funcs/cpu_vec.h +++ b/paddle/phi/kernels/funcs/cpu_vec.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #ifdef PADDLE_WITH_MKLML #include "paddle/phi/backends/dynload/mklml.h" diff --git a/paddle/phi/kernels/funcs/cufft_util.h b/paddle/phi/kernels/funcs/cufft_util.h index 584425c6112a5..3a4a3ef5e5914 100644 --- a/paddle/phi/kernels/funcs/cufft_util.h +++ b/paddle/phi/kernels/funcs/cufft_util.h @@ -15,9 +15,9 @@ #pragma once #include -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/cufft.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_key.h" diff --git a/paddle/phi/kernels/funcs/gru_compute.h b/paddle/phi/kernels/funcs/gru_compute.h index 02b2b91423cfc..eb2ea85dce05f 100644 --- a/paddle/phi/kernels/funcs/gru_compute.h +++ b/paddle/phi/kernels/funcs/gru_compute.h @@ -12,7 +12,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/hipfft_util.h b/paddle/phi/kernels/funcs/hipfft_util.h index 6583a97f17a1d..74ca06fcf17f0 100644 --- a/paddle/phi/kernels/funcs/hipfft_util.h +++ b/paddle/phi/kernels/funcs/hipfft_util.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/hipfft.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/fft.h" #include "paddle/phi/kernels/funcs/fft_key.h" diff --git a/paddle/phi/kernels/funcs/lstm_compute.h b/paddle/phi/kernels/funcs/lstm_compute.h index d51b92fc4fd69..0352ce2710d8c 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.h +++ b/paddle/phi/kernels/funcs/lstm_compute.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index 86d2f5c8efb7e..48649a454ae91 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index e6c681588e4ed..afa73f0a5719b 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -15,10 +15,10 @@ #include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 44fe99046e158..126b5c81ab3e2 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -23,10 +23,10 @@ namespace cub = hipcub; #include "paddle/fluid/operators/layout_utils.h" #include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" diff --git a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu index f74f4bf3814f3..f350106f67cf8 100644 --- a/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_solve_kernel.cu @@ -15,10 +15,10 @@ #ifndef PADDLE_WITH_HIP // HIP not support cusolver -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/complex.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/impl/cholesky_solve_kernel_impl.h" diff --git a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu index eb92a4488e502..65dace12b01ab 100644 --- a/paddle/phi/kernels/gpu/class_center_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/class_center_sample_kernel.cu @@ -30,7 +30,7 @@ namespace cub = hipcub; #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/ProcessGroup.h" diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index 82715dd47326c..ea9da231ecbf7 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -29,7 +29,7 @@ else echo "please run on Mac/Linux" exit 1 fi -protobuf/bin/protoc 
-I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto +protobuf/bin/protoc -I../../paddle/phi/core/ --python_out . ../../paddle/phi/core/external_error.proto python3.7 spider.py tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb From de4310e67dc8acc30ef179c57087919de6069af4 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 24 Nov 2022 15:33:30 +0800 Subject: [PATCH 198/210] [PHI decoupling] simplify "convert_utils.h" in fluid (#48168) * rm dependence to "convert_utils.h" in some files * fix bugs * replace DataType2String with DataTypeToString * replace framework::DataTypeSize with phi::SizeOf * mv convert_function from fluid to phi and rm old map * recommit with pre-commit * repalce ProtoVarType with ProtoDataType and update comment. * fix error about include "dnnl.hpp" * revert add dep mkldnn to convert_utils in phi * add mkldnn deps in convert_utils.h in phi * move deps to convert_utils.h in phi --- .../distributed/ps/service/brpc_utils.cc | 32 ++-- .../distributed/ps/service/heter_client.cc | 4 +- paddle/fluid/eager/amp_auto_cast.h | 4 +- paddle/fluid/eager/eager_amp_auto_cast.h | 4 +- paddle/fluid/eager/grad_node_info.cc | 8 +- paddle/fluid/framework/CMakeLists.txt | 18 +- paddle/fluid/framework/convert_utils.cc | 166 ------------------ paddle/fluid/framework/convert_utils.h | 23 +-- .../fluid/framework/heter_section_worker.cc | 2 +- paddle/fluid/framework/tensor_util.cc | 8 +- paddle/fluid/inference/lite/tensor_utils.cc | 4 +- .../amp/update_loss_scaling_op_npu.cc | 2 +- .../operators/collective/c_embedding_op.h | 2 +- .../collective/c_embedding_op_npu.cc | 10 +- paddle/fluid/operators/detection/bbox_util.h | 2 +- paddle/fluid/operators/ipu/ipu_runtime_op.cc | 2 +- .../pscore/send_and_recv_op_gpu_test.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 18 +- .../platform/device/gpu/cuda/cudnn_desc.h | 33 ++-- .../platform/device/gpu/rocm/miopen_desc.h | 16 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 16 +- paddle/fluid/pybind/imperative.cc | 4 +- paddle/phi/common/data_type.h | 2 + paddle/phi/core/compat/CMakeLists.txt | 4 + paddle/phi/core/utils/data_type.h | 134 ++++++++------ paddle/phi/infermeta/unary.cc | 8 +- paddle/phi/kernels/cpu/arg_min_max_kernel.cc | 2 +- .../kernels/cpu/index_sample_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/index_sample_kernel.cc | 6 +- .../kernels/cpu/unique_consecutive_kernel.cc | 2 +- paddle/phi/kernels/funcs/unique_functor.h | 6 +- paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 2 +- .../kernels/gpu/index_sample_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/index_sample_kernel.cu | 6 +- .../kernels/gpu/unique_consecutive_kernel.cu | 2 +- .../phi/kernels/gpudnn/conv_cudnn_frontend.h | 5 +- 37 files changed, 202 insertions(+), 371 deletions(-) delete mode 100644 paddle/fluid/framework/convert_utils.cc diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 53edb62ff83fc..f07b66e1e90b9 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -111,14 +111,13 @@ void SerializeLodTensor(framework::Variable* var, } // IO Buffer if (platform::is_cpu_place(tensor->place())) { - auto data_len = tensor->numel() * framework::DataTypeSize(tensor->dtype()); + auto data_len = tensor->numel() * phi::SizeOf(tensor->dtype()); iobuf->append(reinterpret_cast(&data_len), 8); 
iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = - new char[tensor->numel() * - framework::DataTypeSize(tensor->dtype())]; // NOLINT + new char[tensor->numel() * phi::SizeOf(tensor->dtype())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), @@ -128,7 +127,7 @@ void SerializeLodTensor(framework::Variable* var, tensor->numel() * framework::SizeOfType( framework::TransToProtoVarType(tensor->dtype())), stream); - auto data_len = tensor->numel() * framework::DataTypeSize(tensor->dtype()); + auto data_len = tensor->numel() * phi::SizeOf(tensor->dtype()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); delete[] temp_ptr; @@ -159,14 +158,13 @@ void SerializeSelectedRows(framework::Variable* var, } // IO Buffer if (platform::is_cpu_place(tensor->place())) { - auto data_len = tensor->numel() * framework::DataTypeSize(tensor->dtype()); + auto data_len = tensor->numel() * phi::SizeOf(tensor->dtype()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(tensor->data()), data_len); } else { #ifdef PADDLE_WITH_CUDA char* temp_ptr = - new char[tensor->numel() * - framework::DataTypeSize(tensor->dtype())]; // NOLINT + new char[tensor->numel() * phi::SizeOf(tensor->dtype())]; // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), @@ -176,7 +174,7 @@ void SerializeSelectedRows(framework::Variable* var, tensor->numel() * framework::SizeOfType( framework::TransToProtoVarType(tensor->dtype())), stream); - auto data_len = tensor->numel() * framework::DataTypeSize(tensor->dtype()); + auto data_len = tensor->numel() * phi::SizeOf(tensor->dtype()); iobuf->append(reinterpret_cast(&data_len), 8); iobuf->append(reinterpret_cast(temp_ptr), data_len); delete[] temp_ptr; @@ -259,16 +257,15 @@ void DeserializeLodTensor(framework::Variable* var, #ifdef PADDLE_WITH_CUDA unsigned long data_len; // NOLINT char* temp_ptr = - new char[tensor->numel() * - framework::DataTypeSize(tensor->dtype())]; // NOLINT - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT - io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT + new char[tensor->numel() * phi::SizeOf(tensor->dtype())]; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT + io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), (void*)temp_ptr, // NOLINT - tensor->numel() * framework::DataTypeSize(tensor->dtype()), + tensor->numel() * phi::SizeOf(tensor->dtype()), stream); delete[] temp_ptr; #endif @@ -303,17 +300,16 @@ void DeserializeSelectedRows( } else if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA char* temp_ptr = - new char[tensor->numel() * - framework::DataTypeSize(tensor->dtype())]; // NOLINT - unsigned long data_len; // NOLINT - io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT + new char[tensor->numel() * phi::SizeOf(tensor->dtype())]; // NOLINT + unsigned long data_len; // NOLINT + io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr, - tensor->numel() * framework::DataTypeSize(tensor->dtype()), + tensor->numel() * phi::SizeOf(tensor->dtype()), stream); delete[] temp_ptr; #endif diff 
--git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index f90893bc8dc2d..2ca9fef5c0876 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -41,14 +41,14 @@ int GetMicroId(const platform::DeviceContext& ctx, } else { #ifdef PADDLE_WITH_CUDA std::vector temp; - temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype())); + temp.resize(tensor->numel() * phi::SizeOf(tensor->dtype())); char* temp_ptr = temp.data(); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(), - tensor->numel() * framework::DataTypeSize(tensor->dtype()), + tensor->numel() * phi::SizeOf(tensor->dtype()), stream); float* temp_ptr_float = reinterpret_cast(temp_ptr); micro_id = static_cast(temp_ptr_float[0]); diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index a16dd95396427..0bf5875823bd9 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -49,7 +49,7 @@ inline std::vector AmpAutoCasts( std::string op_name) { VLOG(6) << "AMP AmpAutoCasts:" << " inputs(" << inputs_name << ") dst_dtype(" - << paddle::framework::DataType2String(dst_dtype) << ")."; + << phi::DataTypeToString(dst_dtype) << ")."; std::vector inputs_casted; for (auto& input : inputs) { if (NeedCast(input, dst_dtype)) { @@ -72,7 +72,7 @@ inline paddle::experimental::Tensor AmpAutoCast( std::string op_name) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << input_name << ") dst_dtype(" - << paddle::framework::DataType2String(dst_dtype) << ")."; + << phi::DataTypeToString(dst_dtype) << ")."; if (dst_dtype == paddle::experimental::DataType::FLOAT16) { if (op_name == "run_program") { return input; diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 22748e31cfd7a..ea3e53b972d99 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -69,7 +69,7 @@ inline std::vector EagerAmpAutoCasts( bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " inputs(" << inputs_name << ") dst_dtype(" - << paddle::framework::DataType2String(dst_dtype) << ")."; + << phi::DataTypeToString(dst_dtype) << ")."; std::vector inputs_casted; for (auto& input : inputs) { if (NeedCast(input, dst_dtype)) { @@ -89,7 +89,7 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << egr::EagerUtils::TensorStr(input) << " to dst_dtype(" - << paddle::framework::DataType2String(dst_dtype) << ")."; + << phi::DataTypeToString(dst_dtype) << ")."; if (dst_dtype == paddle::experimental::DataType::FLOAT16) { if (op_name == "run_program") { return input; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index afa8a6f205259..43da47436e2ac 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -41,16 +41,16 @@ static void CheckTensor(const paddle::experimental::Tensor& pre, "The tensor in before and after hook are not consistent")); } if (pre.initialized() && post.initialized()) { - VLOG(7) << paddle::framework::DataType2String(pre.dtype()) << " " - << paddle::framework::DataType2String(post.dtype()); + VLOG(7) << phi::DataTypeToString(pre.dtype()) << " " + << phi::DataTypeToString(post.dtype()); PADDLE_ENFORCE_EQ( pre.dtype(), post.dtype(), 
paddle::platform::errors::PermissionDenied( "The dtype of tensor before(%s) and after(%s) hook are not " "consistent", - paddle::framework::DataType2String(pre.dtype()), - paddle::framework::DataType2String(post.dtype()))); + phi::DataTypeToString(pre.dtype()), + phi::DataTypeToString(post.dtype()))); PADDLE_ENFORCE_EQ(pre.place(), post.place(), paddle::platform::errors::PermissionDenied( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6fd4095d0d28f..05c2a4ccfb2c4 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1242,23 +1242,7 @@ cc_test( SRCS phi_utils_test.cc DEPS phi_utils) -if(WITH_GPU OR WITH_ROCM) - cc_library( - fluid_convert_utils - SRCS convert_utils.cc - DEPS data_type place gpu_info) -else() - cc_library( - fluid_convert_utils - SRCS convert_utils.cc - DEPS data_type place) -endif() - -# every source file that includes "dnnl.h" must depends on mkldnn -# or, the first one should depends on mkldnn -if(WITH_MKLDNN) - add_dependencies(fluid_convert_utils mkldnn) -endif() +cc_library(fluid_convert_utils DEPS data_type) cc_test( convert_utils_test diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc deleted file mode 100644 index 49efde53f4a8e..0000000000000 --- a/paddle/fluid/framework/convert_utils.cc +++ /dev/null @@ -1,166 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/convert_utils.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/platform/device/gpu/gpu_info.h" - -#include "paddle/phi/common/pstring.h" -namespace paddle { -namespace framework { - -paddle::experimental::DataType TransToPhiDataType( - const paddle::framework::proto::VarType::Type& dtype) { - // Set the order of case branches according to the frequency with - // the data type is used - switch (dtype) { - case paddle::framework::proto::VarType::FP32: - return DataType::FLOAT32; - case paddle::framework::proto::VarType::FP64: - return DataType::FLOAT64; - case paddle::framework::proto::VarType::INT64: - return DataType::INT64; - case paddle::framework::proto::VarType::INT32: - return DataType::INT32; - case paddle::framework::proto::VarType::INT8: - return DataType::INT8; - case paddle::framework::proto::VarType::UINT8: - return DataType::UINT8; - case paddle::framework::proto::VarType::INT16: - return DataType::INT16; - case paddle::framework::proto::VarType::COMPLEX64: - return DataType::COMPLEX64; - case paddle::framework::proto::VarType::COMPLEX128: - return DataType::COMPLEX128; - case paddle::framework::proto::VarType::FP16: - return DataType::FLOAT16; - case paddle::framework::proto::VarType::BF16: - return DataType::BFLOAT16; - case paddle::framework::proto::VarType::BOOL: - return DataType::BOOL; - case paddle::framework::proto::VarType::PSTRING: - return DataType::PSTRING; - default: - return DataType::UNDEFINED; - } -} - -paddle::framework::proto::VarType::Type TransToProtoVarType( - const paddle::experimental::DataType& dtype) { - // Set the order of case branches according to the frequency with - // the data type is used - switch (dtype) { - case DataType::FLOAT32: - return paddle::framework::proto::VarType::FP32; - case DataType::FLOAT64: - return paddle::framework::proto::VarType::FP64; - case DataType::INT64: - return paddle::framework::proto::VarType::INT64; - case DataType::INT32: - return paddle::framework::proto::VarType::INT32; - case DataType::INT8: - return paddle::framework::proto::VarType::INT8; - case DataType::UINT8: - return paddle::framework::proto::VarType::UINT8; - case DataType::INT16: - return paddle::framework::proto::VarType::INT16; - case DataType::COMPLEX64: - return paddle::framework::proto::VarType::COMPLEX64; - case DataType::COMPLEX128: - return paddle::framework::proto::VarType::COMPLEX128; - case DataType::FLOAT16: - return paddle::framework::proto::VarType::FP16; - case DataType::BFLOAT16: - return paddle::framework::proto::VarType::BF16; - case DataType::BOOL: - return paddle::framework::proto::VarType::BOOL; - case DataType::PSTRING: - return paddle::framework::proto::VarType::PSTRING; - default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported data type `%s` when casting it into " - "paddle data type.", - dtype)); - } -} - -size_t DataTypeSize(DataType dtype) { - switch (dtype) { - case DataType::UNDEFINED: - return 0; - case DataType::BOOL: - return sizeof(bool); - case DataType::INT8: - return sizeof(int8_t); - case DataType::UINT8: - return sizeof(uint8_t); - case DataType::INT16: - return sizeof(int16_t); - case DataType::INT32: - return sizeof(int); - case DataType::INT64: - return sizeof(int64_t); - case DataType::BFLOAT16: - return sizeof(paddle::platform::bfloat16); - case DataType::FLOAT16: - return sizeof(paddle::platform::float16); - case DataType::FLOAT32: - return sizeof(float); - case DataType::FLOAT64: - return sizeof(double); - case DataType::COMPLEX64: - return sizeof(paddle::platform::complex); - case DataType::COMPLEX128: - return 
sizeof(paddle::platform::complex); - case DataType::PSTRING: - return sizeof(paddle::platform::pstring); - default: - return 0; - } -} - -DataType String2DataType(const std::string& str) { - if (str == "bool") { - return DataType::BOOL; - } else if (str == "float16") { - return DataType::FLOAT16; - } else if (str == "float32") { - return DataType::FLOAT32; - } else if (str == "float64") { - return DataType::FLOAT64; - } else if (str == "int8") { - return DataType::INT8; - } else if (str == "int16") { - return DataType::INT16; - } else if (str == "int32") { - return DataType::INT32; - } else if (str == "int64") { - return DataType::INT64; - } else if (str == "uint8") { - return DataType::UINT8; - } else if (str == "complex64") { - return DataType::COMPLEX64; - } else if (str == "complex128") { - return DataType::COMPLEX128; - } else if (str == "pstring") { - return DataType::PSTRING; - } else if (str == "bfloat16") { - return DataType::BFLOAT16; - } else { - return DataType::UNDEFINED; - } -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index d3aca94003045..6995e21da8910 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -14,12 +14,8 @@ limitations under the License. */ #pragma once -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/tensor_meta.h" - #include "paddle/fluid/framework/data_type.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/core/utils/data_type.h" // TODO(chenweihang): this file may need to be removed @@ -27,19 +23,16 @@ limitations under the License. */ namespace paddle { namespace framework { -using DataType = paddle::experimental::DataType; +using DataType = phi::DataType; using DataLayout = phi::DataLayout; -DataType TransToPhiDataType( - const paddle::framework::proto::VarType::Type& dtype); - -paddle::framework::proto::VarType::Type TransToProtoVarType( - const DataType& dtype); - -size_t DataTypeSize(DataType dtype); -DataType String2DataType(const std::string& str); +using phi::DataTypeToString; +using phi::SizeOf; +using phi::TransToPhiDataType; -using phi::DataType2String; +inline proto::VarType::Type TransToProtoVarType(const DataType& dtype) { + return static_cast(phi::TransToProtoVarType(dtype)); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 914a172039ec2..6321fe9c5388f 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -45,7 +45,7 @@ void SetMicroId(paddle::framework::Scope* scope, if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA std::vector temp; - temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype())); + temp.resize(tensor->numel() * phi::SizeOf(tensor->dtype())); char* temp_ptr = temp.data(); float* temp_ptr_float = reinterpret_cast(temp_ptr); temp_ptr_float[0] = micro_id; diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 867f15a3e09bd..91b87a98447ce 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -74,9 +74,9 @@ void TensorCopyImpl(const TENSOR& src, #ifdef PADDLE_WITH_MKLDNN auto size = src.layout() == DataLayout::ONEDNN ? 
src.memory_size() - : src.numel() * framework::DataTypeSize(src.dtype()); + : src.numel() * phi::SizeOf(src.dtype()); #else - auto size = src.numel() * framework::DataTypeSize(src.dtype()); + auto size = src.numel() * phi::SizeOf(src.dtype()); #endif if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { @@ -486,7 +486,7 @@ void TensorCopySync(const phi::DenseTensor& src, return; } - auto size = src.numel() * framework::DataTypeSize(src.dtype()); + auto size = src.numel() * phi::SizeOf(src.dtype()); if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } @@ -679,7 +679,7 @@ void TensorToStream(std::ostream& os, os.write(out.data(), size); } { // the 3rd field, tensor data - uint64_t size = tensor.numel() * framework::DataTypeSize(tensor.dtype()); + uint64_t size = tensor.numel() * phi::SizeOf(tensor.dtype()); auto* data_ptr = tensor.data(); PADDLE_ENFORCE_LT(size, diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 3208ef961dcf0..67e81d676a179 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -217,7 +217,7 @@ void TensorCopyAsync(paddle::lite_api::Tensor* dst, const platform::Place& src_place = src.place(); const platform::Place& dst_place = GetNativePlace(dst->target()); const size_t bytes = - static_cast(src.numel()) * framework::DataTypeSize(src.dtype()); + static_cast(src.numel()) * phi::SizeOf(src.dtype()); dst->Resize(phi::vectorize(src.dims())); const void* src_data = src.data(); void* dst_data{nullptr}; @@ -241,7 +241,7 @@ void TensorCopyAsync(phi::DenseTensor* dst, const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); int64_t src_numel = GetLiteTensorNumel(src); - const size_t bytes = src_numel * framework::DataTypeSize(dst->dtype()); + const size_t bytes = src_numel * phi::SizeOf(dst->dtype()); const void* src_data = src.data(); // When Lite is ready, the source type needs to be modified here. 
void* dst_data = dst->mutable_data(dst_place, dst->dtype()); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index dc1cd958f458c..b1bfcf8edd672 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -219,7 +219,7 @@ class LazyZerosNPU { if (!found_inf_vec[0]) { framework::TensorCopy(*x, place, dev_ctx, out); } else if (zero_ptr != dst_ptr) { - auto size = out->numel() * framework::DataTypeSize(out->dtype()); + auto size = out->numel() * phi::SizeOf(out->dtype()); memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); } } diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h index 4dbc6f963194f..55fd021a7cde3 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.h +++ b/paddle/fluid/operators/collective/c_embedding_op.h @@ -128,7 +128,7 @@ class CEmbeddingGradOpCPUKernel : public framework::OpKernel { table_grad_t->mutable_data(table_t->dims(), context.GetPlace()); size_t table_t_mem_size = - table_t->numel() * framework::DataTypeSize(table_grad_t->dtype()); + table_t->numel() * phi::SizeOf(table_grad_t->dtype()); size_t table_grad_t_mem_size = table_grad_t->numel() * framework::SizeOfType( diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index 263fb264dc10d..f7fd7fbd83eab 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -127,10 +127,8 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) { auto pad_shape = phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]}); phi::DenseTensor table_t_pad; - size_t mem_size = - table_t->numel() * framework::DataTypeSize(table_t->dtype()); - size_t line_mem_size = - table_t->dims()[1] * framework::DataTypeSize(table_t->dtype()); + size_t mem_size = table_t->numel() * phi::SizeOf(table_t->dtype()); + size_t line_mem_size = table_t->dims()[1] * phi::SizeOf(table_t->dtype()); PADDLE_ENFORCE_EQ(line_mem_size % 64, 0, platform::errors::InvalidArgument( @@ -227,11 +225,11 @@ void NPUUpdateEmbedding(const framework::ExecutionContext &context) { // copy table_t_pad to table_t T *dst = table_grad_t->mutable_data(table_t->dims(), context.GetPlace()); const size_t mem_size = - table_grad_t->numel() * framework::DataTypeSize(table_grad_t->dtype()); + table_grad_t->numel() * phi::SizeOf(table_grad_t->dtype()); // check align size_t line_mem_size = - table_grad_t->dims()[1] * framework::DataTypeSize(table_grad_t->dtype()); + table_grad_t->dims()[1] * phi::SizeOf(table_grad_t->dtype()); PADDLE_ENFORCE_EQ(line_mem_size % 64, 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 4046f6b2830d8..aba166355ed8e 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -153,7 +153,7 @@ static void AppendProposals(phi::DenseTensor* dst, const phi::DenseTensor& src) { auto* out_data = dst->data(); auto* to_add_data = src.data(); - size_t size_of_t = framework::DataTypeSize(src.dtype()); + size_t size_of_t = phi::SizeOf(src.dtype()); offset *= size_of_t; std::memcpy( reinterpret_cast(reinterpret_cast(out_data) + offset), diff --git a/paddle/fluid/operators/ipu/ipu_runtime_op.cc 
b/paddle/fluid/operators/ipu/ipu_runtime_op.cc index e243c8f7d9e36..2c7bde2871a25 100644 --- a/paddle/fluid/operators/ipu/ipu_runtime_op.cc +++ b/paddle/fluid/operators/ipu/ipu_runtime_op.cc @@ -46,7 +46,7 @@ class IpuRuntimeOp : public framework::OperatorBase { for (size_t i = 0; i < outputs.size(); ++i) { auto* out = outputs[i]; if (out->dims().size() == 0) { - auto sizeof_dtype = framework::DataTypeSize(out->dtype()); + auto sizeof_dtype = phi::SizeOf(out->dtype()); int64_t dim = out->memory_size() / sizeof_dtype; out->Resize({dim}); VLOG(10) << "set ipu_runtime_op output: " << output_names[i] diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 0491acc059919..faac4865975e5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -128,7 +128,7 @@ void InitTensorsOnClient(framework::Scope* scope, reinterpret_cast(x_ptr), platform::CPUPlace(), reinterpret_cast(x_vec_ptr), - x_var->numel() * framework::DataTypeSize(x_var->dtype()), + x_var->numel() * phi::SizeOf(x_var->dtype()), stream); // auto res_var = scope->Var("res")->GetMutable(); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 8ba7b84fe4bf9..5bb8a29ce356e 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -191,8 +191,7 @@ void BufferedReader::ReadAsync(size_t i) { cuda[i].set_layout(cpu[i].layout()); cuda_pinned_ptrs[i] = cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()); - auto size = cpu[i].numel() * - paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], @@ -245,8 +244,7 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); auto gpu_ptr = gpu_ptrs[i]; - auto size = - cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); if (platform::is_cuda_pinned_place(cpu_place)) { memory::Copy( place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get()); @@ -312,8 +310,7 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); auto npu_ptr = npu_ptrs[i]; - auto size = - cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); if ((platform::is_npu_place(cpu_place))) { memory::Copy( place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get()); @@ -364,8 +361,7 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); auto mlu_ptr = mlu_ptrs[i]; - auto size = - cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); if ((platform::is_mlu_place(cpu_place))) { memory::Copy( place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get()); @@ -417,8 +413,7 @@ void BufferedReader::ReadAsync(size_t i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); auto xpu_ptr = xpu_ptrs[i]; - auto size = - cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe // KL3 if ((platform::is_xpu_place(cpu_place))) { @@ -471,8 +466,7 @@ void 
BufferedReader::ReadAsync(size_t i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); auto custom_device_ptr = custom_device_ptrs[i]; - auto size = - cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype()); if ((platform::is_custom_place(cpu_place))) { memory::Copy(place_, custom_device_ptr, cpu_place, cpu_ptr, size); custom_device_stream_->Synchronize(); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h index de8c30efe5a4e..677dc49cce4b6 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h @@ -23,9 +23,9 @@ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { class DenseTensor; @@ -37,7 +37,7 @@ namespace platform { template inline cudnnDataType_t ToCudnnDataType(const T& t) { auto type = framework::ToDataType(t); - return ToCudnnDataType(type); + return ToCudnnDataType(phi::TransToPhiDataType(type)); } template @@ -68,21 +68,20 @@ inline std::vector TransformDimOrder(const std::vector& dims) { } template <> -inline cudnnDataType_t ToCudnnDataType( - const framework::proto::VarType::Type& t) { +inline cudnnDataType_t ToCudnnDataType(const phi::DataType& t) { cudnnDataType_t type = CUDNN_DATA_FLOAT; switch (t) { - case framework::proto::VarType::FP16: + case phi::DataType::FLOAT16: type = CUDNN_DATA_HALF; break; - case framework::proto::VarType::FP32: + case phi::DataType::FLOAT32: type = CUDNN_DATA_FLOAT; break; - case framework::proto::VarType::FP64: + case phi::DataType::FLOAT64: type = CUDNN_DATA_DOUBLE; break; #if CUDNN_VERSION_MIN(8, 1, 0) - case framework::proto::VarType::BF16: + case phi::DataType::BFLOAT16: type = CUDNN_DATA_BFLOAT16; break; #endif @@ -152,12 +151,12 @@ class TensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnSetTensorNdDescriptor( - desc_.get(), - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())), - dims_with_group.size(), - dims_with_group.data(), - strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnSetTensorNdDescriptor(desc_.get(), + ToCudnnDataType(tensor.dtype()), + dims_with_group.size(), + dims_with_group.data(), + strides.data())); } void set(const std::vector& dims, @@ -179,8 +178,7 @@ class TensorDescriptor { void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { auto dims = phi::vectorize(tensor.dims()); - auto dtype = - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())); + auto dtype = ToCudnnDataType(tensor.dtype()); set(dims, format, dtype); } @@ -232,8 +230,7 @@ class FilterDescriptor { const cudnnTensorFormat_t format, const int groups = 1) { auto dims = phi::vectorize(tensor.dims()); - auto dtype = - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())); + auto dtype = ToCudnnDataType(tensor.dtype()); set(dims, format, dtype, groups); } diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h index 8faae285e49e3..1ce4df05be64e 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_desc.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/device/gpu/rocm/miopen_helper.h" 
#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/utils/data_type.h" namespace phi { class DenseTensor; @@ -36,7 +37,7 @@ namespace platform { template inline miopenDataType_t ToCudnnDataType(const T& t) { auto type = framework::ToDataType(t); - return ToCudnnDataType(type); + return ToCudnnDataType(phi::TransToPhiDataType(type)); } inline std::vector TransformDimOrder(const std::vector& dims) { @@ -63,14 +64,13 @@ inline std::vector TransformDimOrder(const std::vector& dims) { } template <> -inline miopenDataType_t ToCudnnDataType( - const framework::proto::VarType::Type& t) { +inline miopenDataType_t ToCudnnDataType(const phi::DataType& t) { miopenDataType_t type = miopenFloat; switch (t) { - case framework::proto::VarType::FP16: + case phi::DataType::FLOAT16: type = miopenHalf; break; - case framework::proto::VarType::FP32: + case phi::DataType::FLOAT32: type = miopenFloat; break; default: @@ -142,7 +142,7 @@ class TensorDescriptor { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())), + ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), const_cast(strides.data()))); @@ -166,7 +166,7 @@ class TensorDescriptor { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())), + ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), const_cast(strides.data()))); @@ -214,7 +214,7 @@ class FilterDescriptor { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetTensorDescriptor( (miopenTensorDescriptor_t)(desc_.get()), - ToCudnnDataType(framework::TransToProtoVarType(tensor.dtype())), + ToCudnnDataType(tensor.dtype()), static_cast(dims_with_group.size()), const_cast(dims_with_group.data()), const_cast(strides.data()))); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 3389daf330c7c..14368a9d99cad 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -79,7 +79,7 @@ class EagerNumpyAllocation : public phi::Allocation { explicit EagerNumpyAllocation(PyObject* numpy_data, phi::DataType dtype) : Allocation( static_cast(pybind11::detail::array_proxy(numpy_data)->data), - framework::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + phi::SizeOf(dtype) * PyArray_Size_(numpy_data), paddle::platform::CPUPlace()), arr_(numpy_data) { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 0610a51d4cc22..6c91b32786648 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -116,7 +116,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, } auto tensor_dims = self->tensor.shape(); auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type()); - auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type()); + auto sizeof_dtype = phi::SizeOf(self->tensor.type()); Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank]; Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank]; size_t numel = 1; @@ -203,8 +203,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, paddle::platform::GpuMemcpySync( pybind11::detail::array_proxy(array)->data, dense_tensor->data(), - paddle::framework::DataTypeSize(dense_tensor->dtype()) * - 
dense_tensor->numel(), + phi::SizeOf(dense_tensor->dtype()) * dense_tensor->numel(), kind); } else { VLOG(6) << "Getting DenseTensor's numpy value"; @@ -213,8 +212,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, paddle::platform::GpuMemcpySync( pybind11::detail::array_proxy(array)->data, dense_tensor->data(), - paddle::framework::DataTypeSize(dense_tensor->dtype()) * - dense_tensor->numel(), + phi::SizeOf(dense_tensor->dtype()) * dense_tensor->numel(), kind); } #endif @@ -258,8 +256,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, ->MemoryCopyD2H( pybind11::detail::array_proxy(array)->data, dense_tensor->data(), - paddle::framework::DataTypeSize(dense_tensor->dtype()) * - dense_tensor->numel()); + phi::SizeOf(dense_tensor->dtype()) * dense_tensor->numel()); } else { VLOG(6) << "Getting DenseTensor's numpy value"; auto dense_tensor = @@ -268,8 +265,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, ->MemoryCopyD2H( pybind11::detail::array_proxy(array)->data, dense_tensor->data(), - paddle::framework::DataTypeSize(dense_tensor->dtype()) * - dense_tensor->numel()); + phi::SizeOf(dense_tensor->dtype()) * dense_tensor->numel()); } #endif } else { @@ -1698,7 +1694,7 @@ static PyObject* tensor_method_element_size(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - uint32_t element_size = framework::DataTypeSize(self->tensor.dtype()); + uint32_t element_size = phi::SizeOf(self->tensor.dtype()); return ToPyObject(element_size); EAGER_CATCH_AND_THROW_RETURN_NULL diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 1eb5f8bd4764c..7adec4dca2bb7 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -562,7 +562,7 @@ void BindImperative(py::module *m_ptr) { &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); - size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); + size_t data_size = t.numel() * phi::SizeOf(t.dtype()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); // 4. maintain mmap fd set & backup ipc_name @@ -602,7 +602,7 @@ void BindImperative(py::module *m_ptr) { &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); - size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); + size_t data_size = t.numel() * phi::SizeOf(t.dtype()); auto shared_writer_holder = memory::allocation::AllocateMemoryMapWriterAllocation(data_size); // 4. 
maintain mmap fd set & backup ipc_name diff --git a/paddle/phi/common/data_type.h b/paddle/phi/common/data_type.h index 339f240dae268..d66c6d89fe66e 100644 --- a/paddle/phi/common/data_type.h +++ b/paddle/phi/common/data_type.h @@ -258,6 +258,8 @@ inline std::string DataTypeToString(const DataType& dtype) { namespace phi { using DataType = paddle::experimental::DataType; +using paddle::experimental::DataTypeToString; +using paddle::experimental::SizeOf; } // namespace phi namespace paddle { diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt index 31b1636d5b2fd..3d76cb1112c7c 100644 --- a/paddle/phi/core/compat/CMakeLists.txt +++ b/paddle/phi/core/compat/CMakeLists.txt @@ -9,6 +9,10 @@ cc_library( set(convert_utils_deps data_type place op_utils phi_backends) +if(WITH_MKLDNN) + set(convert_utils_deps ${convert_utils_deps} mkldnn) +endif() + cc_library( convert_utils SRCS convert_utils.cc diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 7852d87c9a293..6879c6206564c 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -22,31 +22,6 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" namespace phi { -// Here we can't depend on the fluid proto::VarType, so we use the dtype enum -// value directly. See also `assign_value_sig.cc`. -// proto::VarType::INT16 -> 1 -> phi::DataType::INT16 -// proto::VarType::INT32 -> 2 -> phi::DataType::INT32 -// proto::VarType::INT64 -> 3 -> phi::DataType::INT64 -// proto::VarType::FP16 -> 4 -> phi::DataType::FLOAT16 -// proto::VarType::FP32 -> 5 -> phi::DataType::FLOAT32 -// proto::VarType::FP64 -> 6 -> phi::DataType::FLOAT64 -// proto::VarType::UINT8 -> 20 -> phi::DataType::UINT8 -static std::map var_type_map{{1, phi::DataType::INT16}, - {2, phi::DataType::INT32}, - {3, phi::DataType::INT64}, - {4, phi::DataType::FLOAT16}, - {5, phi::DataType::FLOAT32}, - {6, phi::DataType::FLOAT64}, - {20, phi::DataType::UINT8}}; - -static std::map map_to_var_type{{phi::DataType::INT16, 1}, - {phi::DataType::INT32, 2}, - {phi::DataType::INT64, 3}, - {phi::DataType::FLOAT16, 4}, - {phi::DataType::FLOAT32, 5}, - {phi::DataType::FLOAT64, 6}, - {phi::DataType::UINT8, 20}}; - #define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); @@ -136,39 +111,98 @@ inline DataType ToRealType(const DataType& type) { } } -inline std::string DataType2String(DataType dtype) { +// In some cases we need to use the conversion between phi::DataType and +// fluid proto::VarType::Type, but can't depend on the proto::VarType::Type. +// So here we defined an enum type ProtoDataType which corresponds to +// proto::VarType::Type in fluid, but keeps only the data types we need. +// Note: The ProtoDataType (defined here) and proto::VarType::Type (defined +// in framework.pb.h) need to be modified simultaneously. 
+enum ProtoDataType { + BOOL = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + UINT8 = 20, + INT8 = 21, + BF16 = 22, + COMPLEX64 = 23, + COMPLEX128 = 24, + PSTRING = 29 +}; + +inline DataType TransToPhiDataType(const int& dtype) { + // Set the order of case branches according to the frequency with + // the data type is used switch (dtype) { - case DataType::BOOL: - return "bool"; + case ProtoDataType::FP32: + return DataType::FLOAT32; + case ProtoDataType::FP64: + return DataType::FLOAT64; + case ProtoDataType::INT64: + return DataType::INT64; + case ProtoDataType::INT32: + return DataType::INT32; + case ProtoDataType::INT8: + return DataType::INT8; + case ProtoDataType::UINT8: + return DataType::UINT8; + case ProtoDataType::INT16: + return DataType::INT16; + case ProtoDataType::COMPLEX64: + return DataType::COMPLEX64; + case ProtoDataType::COMPLEX128: + return DataType::COMPLEX128; + case ProtoDataType::FP16: + return DataType::FLOAT16; + case ProtoDataType::BF16: + return DataType::BFLOAT16; + case ProtoDataType::BOOL: + return DataType::BOOL; + case ProtoDataType::PSTRING: + return DataType::PSTRING; + default: + return DataType::UNDEFINED; + } +} + +inline int TransToProtoVarType(const DataType& dtype) { + // Set the order of case branches according to the frequency with + // the data type is used + switch (dtype) { + case DataType::FLOAT32: + return ProtoDataType::FP32; + case DataType::FLOAT64: + return ProtoDataType::FP64; + case DataType::INT64: + return ProtoDataType::INT64; + case DataType::INT32: + return ProtoDataType::INT32; case DataType::INT8: - return "int8"; + return ProtoDataType::INT8; case DataType::UINT8: - return "uint8"; + return ProtoDataType::UINT8; case DataType::INT16: - return "int16"; - case DataType::INT32: - return "int32"; - case DataType::INT64: - return "int64"; - case DataType::FLOAT16: - return "float16"; - case DataType::FLOAT32: - return "float32"; - case DataType::FLOAT64: - return "float64"; + return ProtoDataType::INT16; case DataType::COMPLEX64: - return "complex64"; + return ProtoDataType::COMPLEX64; case DataType::COMPLEX128: - return "complex128"; - case DataType::PSTRING: - return "pstring"; + return ProtoDataType::COMPLEX128; + case DataType::FLOAT16: + return ProtoDataType::FP16; case DataType::BFLOAT16: - return "bfloat16"; + return ProtoDataType::BF16; + case DataType::BOOL: + return ProtoDataType::BOOL; + case DataType::PSTRING: + return ProtoDataType::PSTRING; default: - PADDLE_THROW( - errors::InvalidArgument("Unknow phi::DataType, the int value = %d.", - static_cast(dtype))); - return ""; + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported data type `%s` when casting it into " + "paddle data type.", + dtype)); } } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 5f602a134ecd9..f2c3873d81e5c 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -133,9 +133,9 @@ void ArgMinMaxInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " "received [%s]", - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64), - phi::DataType2String(var_type_map[dtype]))); + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64), + phi::DataTypeToString(phi::TransToPhiDataType(dtype)))); if (!config.is_runtime && axis.FromTensor()) { std::vector vec; @@ -177,7 +177,7 @@ void ArgMinMaxInferMeta(const MetaTensor& x, auto x_rank = x_dims.size(); 
if (int_axis < 0) int_axis += x_rank; if (config.is_runtime) { - if (dtype == map_to_var_type[DataType::INT32]) { + if (dtype == phi::TransToProtoVarType(DataType::INT32)) { int64_t all_element_num = 0; if (flatten) { all_element_num = phi::product(x_dims); diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 999cb16620d26..5c7e79c9e8ade 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -149,7 +149,7 @@ void ArgMinMaxKernel(const Context& dev_ctx, return; } phi::VisitDataTypeTiny( - var_type_map[dtype], + phi::TransToPhiDataType(dtype), VisitDataArgMinMaxFunctor( dev_ctx, x, axis.to(), keepdims, flatten, out)); } diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc index 50f2c3267fbc5..42aef3cc2482b 100644 --- a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -81,9 +81,9 @@ void IndexSampleGradKernel(const Context& ctx, errors::InvalidArgument( "Input(Index) holds the wrong type, it holds %s, but " "desires to be %s or %s", - phi::DataType2String(index_type), - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64))); + phi::DataTypeToString(index_type), + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64))); if (index_type == DataType::INT32) { IndexSampleGradInner(ctx, out_grad, index, x_grad); } else if (index_type == DataType::INT64) { diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index 11e24b10b153b..e51d06c442408 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -94,9 +94,9 @@ void IndexSampleKernel(const Context &ctx, errors::InvalidArgument( "Input(Index) holds the wrong type, it holds %s, but " "desires to be %s or %s", - phi::DataType2String(index_type), - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64))); + phi::DataTypeToString(index_type), + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64))); if (index_type == DataType::INT32) { IndexSampleInner(ctx, x, index, out); } else if (index_type == DataType::INT64) { diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc index 07df5f1f566a4..2e1dd3e4ecbf1 100644 --- a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc +++ b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -32,7 +32,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, DenseTensor* out, DenseTensor* index, DenseTensor* counts) { - auto data_type = var_type_map[dtype]; + auto data_type = phi::TransToPhiDataType(dtype); if (data_type == phi::DataType::INT32) { PADDLE_ENFORCE_LE( x.numel(), diff --git a/paddle/phi/kernels/funcs/unique_functor.h b/paddle/phi/kernels/funcs/unique_functor.h index 2c713243904eb..edd3935ef7603 100644 --- a/paddle/phi/kernels/funcs/unique_functor.h +++ b/paddle/phi/kernels/funcs/unique_functor.h @@ -82,9 +82,9 @@ struct UniqueOpFunctor { phi::errors::InvalidArgument( "Index holds the wrong type, it holds %s, " "but desires to be %s or %s", - phi::DataType2String(index_type), - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64))); + phi::DataTypeToString(index_type), + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64))); if 
(index_type == DataType::INT32) { for (auto i = 0; i < in_->numel(); ++i) { diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 0cbf206bb3689..5764ba54b397a 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -215,7 +215,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx, return; } phi::VisitDataTypeTiny( - var_type_map[dtype], + phi::TransToPhiDataType(dtype), VisitDataCudaArgMinMaxFunctor( dev_ctx, x, axis.to(), keepdims, flatten, out)); } diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu index 5193d0a7ab05c..b016cf20b1332 100644 --- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -75,9 +75,9 @@ void IndexSampleGradKernel(const Context& ctx, errors::InvalidArgument( "Input(Index) holds the wrong type, it holds %s, but " "desires to be %s or %s", - phi::DataType2String(index_type), - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64))); + phi::DataTypeToString(index_type), + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64))); auto stream = reinterpret_cast(ctx).stream(); auto input_num = x.numel(); diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu index dd1b4aa97d2b6..31fe1ff2a02f2 100644 --- a/paddle/phi/kernels/gpu/index_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -64,9 +64,9 @@ void IndexSampleKernel(const Context& ctx, errors::InvalidArgument( "Input(Index) holds the wrong type, it holds %s, but " "desires to be %s or %s", - phi::DataType2String(index_type), - phi::DataType2String(DataType::INT32), - phi::DataType2String(DataType::INT64))); + phi::DataTypeToString(index_type), + phi::DataTypeToString(DataType::INT32), + phi::DataTypeToString(DataType::INT64))); const T* in_data = x.data(); T* out_data = ctx.template Alloc(out); auto stream = reinterpret_cast(ctx).stream(); diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu index 9eb9309bb496f..9c04ab511d04f 100644 --- a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -33,7 +33,7 @@ void UniqueConsecutiveKernel(const Context& dev_ctx, DenseTensor* out, DenseTensor* index, DenseTensor* counts) { - auto data_type = var_type_map[dtype]; + auto data_type = phi::TransToPhiDataType(dtype); if (data_type == phi::DataType::INT32) { PADDLE_ENFORCE_LE( x.numel() + 1, diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h index e45ff63d41b3a..368b5585b291e 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h @@ -17,11 +17,11 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/dynload/cudnn_frontend.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" @@ -95,8 +95,7 @@ class CudnnFrontendConvHelper { .setStrides(strides.size(), strides.data()) .setId(id) .setAlignment(GetAlignment(tensor)) - .setDataType(paddle::platform::ToCudnnDataType( - paddle::framework::TransToProtoVarType(tensor->dtype()))) + .setDataType(paddle::platform::ToCudnnDataType(tensor->dtype())) .build(); } From 6138331db6ba898bc0f28af427f6beb421c2d21a Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Thu, 24 Nov 2022 17:15:52 +0800 Subject: [PATCH 199/210] remove inplace_abn (#48275) --- python/paddle/fluid/layers/nn.py | 288 ------------------ .../tests/unittests/test_inplace_abn_op.py | 144 +-------- .../fluid/tests/unittests/test_layers.py | 28 -- 3 files changed, 16 insertions(+), 444 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 49d6906d6a4d3..0f542bfab1f5a 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -74,7 +74,6 @@ 'pool2d', 'pool3d', 'batch_norm', - 'inplace_abn', 'instance_norm', 'data_norm', 'reduce_sum', @@ -2797,293 +2796,6 @@ def batch_norm( return helper.append_activation(batch_norm_out) -def inplace_abn( - input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - act_alpha=1.0, -): - r""" - **In-place Activation Batch Normalization Layer** - - This layer calculates batch normalization and activation with in-place memory. - For batch normalization calculations, see `fluid.layers.batch_norm`. - For in-place activation batch normalization, see `In-Place Activated BatchNorm for - Memory-Optimized Training of DNNs `_ - - `inplace_abn` only support activation type as `None`, `identity`, `leaky_relu`, - `elu` currently. - `inplace_abn` only support data type as `float32`, `float64` currently. - - Note: - if build_strategy.sync_batch_norm=True, the batch_norm in network will use - sync_batch_norm automatically. - `is_test = True` can only be used in test program and inference program, `is_test` CANNOT be set to True in train program, if you want to use global status from pre_train model in train program, please set `use_global_stats = True`. - - Args: - input(Variable): The rank of input variable can be 2, 3, 4, 5. The data type - is float16 or float32 or float64. - act(string, Default None): Activation type, linear|relu|prelu|... - is_test (bool, Default False): A flag indicating whether it is in - test phrase or not. - momentum(float|Variable, Default 0.9): The value used for the moving_mean and - moving_var computation. This should be a float number or a Variable with - shape [1] and data type as float32. The updated formula is: - :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` - :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` - Default is 0.9. - epsilon(float, Default 1e-05): A value added to the denominator for - numerical stability. Default is 1e-5. 
- param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` - of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. - bias_attr(ParamAttr|None): The parameter attribute for the bias of inplace_abn. - If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. - data_layout (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. - Usually name is no need to set and None by default. - moving_mean_name(str, Default None): The name of moving_mean which store the global Mean. If it - is set to None, inplace_abn will save global mean with a random name, otherwise, inplace_abn - will save global mean with the string. - moving_variance_name(str, Default None): The name of the moving_variance which store the global Variance. - If it is set to None, inplace_abn, will save global variance with a random name, otherwise, inplace_abn - will save global variance with the string. - do_model_average_for_mean_and_var(bool, Default True): Whether parameter mean and variance should do model - average when model average is enabled. - use_global_stats(bool, Default False): Whether to use global mean and - variance. In inference or test mode, set use_global_stats to true - or is_test to true, and the behavior is equivalent. - In train mode, when setting use_global_stats True, the global mean - and variance are also used during train period. - act_alpha(float, Default 1.0): when activation is in ['elu', 'identity', 'leaky_relu'], - inplace activative batch normalization will be used, and alpha parameter for activation - can be given by this parameter. - Returns: - A Variable holding Tensor which is the result after applying batch normalization and activation on the input, - has same shape and data type with input. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32') - hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') - hidden2 = fluid.layers.inplace_abn(input=hidden1) - hidden3 = fluid.layers.inplace_abn(input=hidden2, act='leaky_relu', act_alpha=0.2) - - """ - assert act in [None, 'identity', 'leaky_relu', 'elu'], ( - "inplace_abn only support act as None, 'identity', " - "'leaky_relu', 'elu' currently" - ) - assert ( - bias_attr is not False - ), "bias_attr should not be False in inplace_abn." 
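
With the layer removed, the test rewrite later in this patch composes batch_norm with an explicit activation instead. A minimal sketch of that composition, using an illustrative input shape and alpha value and the parameter names the tests use (the in-place memory reuse of inplace_abn is not reproduced here):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    data = fluid.data(name='data', shape=[-1, 3, 32, 32], dtype='float32')
    # batch_norm followed by an explicit activation stands in for
    # inplace_abn(..., act='leaky_relu', act_alpha=0.2)
    bn = fluid.layers.batch_norm(
        data,
        param_attr=fluid.ParamAttr(name='bn_scale'),
        bias_attr=fluid.ParamAttr(name='bn_bias'),
        moving_mean_name='bn_moving_mean',
        moving_variance_name='bn_moving_variance',
    )
    out = paddle.nn.functional.leaky_relu(bn, 0.2)
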
- helper = LayerHelper('inplace_abn', **locals()) - - check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'inplace_abn' - ) - dtype = helper.input_dtype() - - input_shape = input.shape - if data_layout == 'NCHW': - channel_num = input_shape[1] - else: - if data_layout == 'NHWC': - channel_num = input_shape[-1] - else: - raise ValueError("unsupported data layout:" + data_layout) - - param_shape = [channel_num] - - # create parameter - scale = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0), - ) - bias = helper.create_parameter( - attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True - ) - - mean = helper.create_parameter( - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - mean.stop_gradient = True - - variance = helper.create_parameter( - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var, - ), - shape=param_shape, - dtype=dtype, - ) - variance.stop_gradient = True - - # create output - # mean and mean_out share the same memory - mean_out = mean - # variance and variance out share the same memory - variance_out = variance - # batch_norm_out and input share the same memory - batch_norm_out = input - - if in_dygraph_mode(): - inputs_has_MomemtumTensor = False - attrs_has_momentum = False - tmp_tensor_type = core.eager.Tensor - if isinstance(momentum, tmp_tensor_type): - inputs_has_MomemtumTensor = True - else: - attrs_has_momentum = True - - attrs__ = () - if attrs_has_momentum: - attrs__ = ( - 'momentum', - momentum, - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - 'activation', - act, - 'alpha', - act_alpha, - ) - else: - attrs__ = ( - 'epsilon', - epsilon, - 'is_test', - is_test, - 'data_layout', - data_layout, - 'use_mkldnn', - False, - 'fuse_with_relu', - False, - 'use_global_stats', - use_global_stats, - 'activation', - act, - 'alpha', - act_alpha, - ) - if inputs_has_MomemtumTensor: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, - scale, - bias, - mean, - variance, - momentum, - mean_out, - variance_out, - *attrs__, - ) - return batch_norm_out - else: - batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, - scale, - bias, - mean, - variance, - None, - mean_out, - variance_out, - *attrs__, - ) - return batch_norm_out - - saved_mean = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - reserve_space = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True - ) - - inputs = { - "X": input, - "Scale": scale, - "Bias": bias, - "Mean": mean, - "Variance": variance, - } - attrs = { - "epsilon": epsilon, - "is_test": is_test, - "data_layout": data_layout, - "use_mkldnn": False, - "fuse_with_relu": False, - "use_global_stats": use_global_stats, - "activation": act, - "alpha": act_alpha, - } - if isinstance(momentum, Variable): - inputs['MomemtumTensor'] = momentum - else: - attrs['momentum'] = momentum - outputs = { - "Y": batch_norm_out, - "MeanOut": mean_out, - "VarianceOut": variance_out, - "SavedMean": saved_mean, - "SavedVariance": 
saved_variance, - } - if reserve_space is not None: - outputs["ReserveSpace"] = reserve_space - - helper.append_op( - type="inplace_abn", inputs=inputs, outputs=outputs, attrs=attrs - ) - - return batch_norm_out - - def instance_norm( input, epsilon=1e-05, param_attr=None, bias_attr=None, name=None ): diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index 56f3c13f4f33d..f29dbc7086736 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -17,7 +17,6 @@ import os import paddle.fluid.core as core import paddle.fluid as fluid -from paddle.fluid import compiler import paddle @@ -54,33 +53,21 @@ def build_program( append_batch_size=False, stop_gradient=False, ) - if inplace: - bn = fluid.layers.inplace_abn( - data, - act=activation, - param_attr=fluid.ParamAttr(name='bn_scale'), - bias_attr=fluid.ParamAttr(name='bn_bias'), - moving_mean_name='bn_moving_mean', - moving_variance_name='bn_moving_variance', - data_layout=layout, - is_test=only_forward, - act_alpha=alpha, - ) - else: - bn = fluid.layers.batch_norm( - data, - param_attr=fluid.ParamAttr(name='bn_scale'), - bias_attr=fluid.ParamAttr(name='bn_bias'), - moving_mean_name='bn_moving_mean', - moving_variance_name='bn_moving_variance', - data_layout=layout, - is_test=only_forward, - in_place=inplace, - ) - if activation == 'leaky_relu': - bn = paddle.nn.functional.leaky_relu(bn, alpha) - if activation == 'elu': - bn = paddle.nn.functional.elu(bn, alpha) + + bn = fluid.layers.batch_norm( + data, + param_attr=fluid.ParamAttr(name='bn_scale'), + bias_attr=fluid.ParamAttr(name='bn_bias'), + moving_mean_name='bn_moving_mean', + moving_variance_name='bn_moving_variance', + data_layout=layout, + is_test=only_forward, + in_place=inplace, + ) + if activation == 'leaky_relu': + bn = paddle.nn.functional.leaky_relu(bn, alpha) + if activation == 'elu': + bn = paddle.nn.functional.elu(bn, alpha) # NOTE: in inplace mode input and output of bn # may have same name, multiply 1. 
to generate @@ -94,105 +81,6 @@ def build_program( sgd_opt.backward(out) return main, startup, [out, bn] - def compare(self, place, layout, only_forward, activation, alpha, use_cuda): - seed = 10 - os.environ['FLAGS_cudnn_deterministic'] = "1" - data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2 - - fetch_outs = [] - fetch_names = [] - for inplace in [False, True]: - main, startup, outs = self.build_program( - place, - layout, - seed, - only_forward, - activation, - alpha, - inplace=inplace, - ) - exe = fluid.Executor(place) - exe.run(startup) - - fetch_name = [v.name for v in outs] + [ - 'bn_moving_mean', - 'bn_moving_variance', - 'bn_scale', - 'bn_bias', - ] - if not only_forward: - others = [ - 'inplace_abn_0.tmp_0' if inplace else 'batch_norm_0.tmp_0', - 'inplace_abn_0.tmp_1' if inplace else 'batch_norm_0.tmp_1', - 'bn_scale@GRAD', - 'bn_bias@GRAD', - 'input@GRAD', - ] - fetch_name += others - for nm in fetch_name: - fv = fluid.framework._get_var(str(nm), program=main) - fv.persistable = True - - build_strategy = fluid.BuildStrategy() - build_strategy.sync_batch_norm = ( - use_cuda and fluid.core.get_cuda_device_count() > 1 - ) - build_strategy.enable_inplace = inplace - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 1 if os.name == 'nt' else 0 - comp_prog1 = compiler.CompiledProgram(main).with_data_parallel( - outs[0].name if not only_forward else None, - build_strategy=build_strategy, - exec_strategy=exec_strategy, - ) - bn_fetches = exe.run( - program=main, feed={'input': data}, fetch_list=fetch_name - ) - fetch_outs.append(bn_fetches) - fetch_names.append(fetch_name) - - for bn_val, inplace_abn_val, name1, name2 in zip( - *(fetch_outs + fetch_names) - ): - np.testing.assert_allclose( - bn_val, - inplace_abn_val, - rtol=1e-05, - atol=0.01, - err_msg='Output (' - + name1 - + ':' - + name2 - + ') has diff on {} with {} layout and {} activation. 
\n'.format( - place, layout, activation - ) - + '\nBN ' - + str(bn_val) - + '\n' - + 'Inplace ABN ' - + str(inplace_abn_val), - ) - - def test_op(self): - use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] - # use_cudas = [False] - for use_cuda in use_cudas: - place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() - layouts = ["NCHW", "NHWC"] - for layout in layouts: - for activation, alpha in zip( - [None, 'elu', 'leaky_relu'], [0.0, 1.0, 0.02] - ): - for infer_only in [True, False]: - self.compare( - place, - layout, - infer_only, - activation, - alpha, - use_cuda, - ) - def test_all_branches(self): seed = 10 os.environ['FLAGS_cudnn_deterministic'] = "1" @@ -212,7 +100,7 @@ def test_all_branches(self): activation, alpha, use_cuda, - True, + False, ) exe = fluid.Executor(place) exe.run(startup) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index ed230ea98b00f..937c027222001 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3686,34 +3686,6 @@ def make_batch_norm_momentum_variable(self): out = layers.batch_norm(data, momentum=momentum) return out - def make_inplace_abn(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - data = self._get_data( - name='data', shape=[32, 128, 128], dtype="float32" - ) - out = layers.inplace_abn(data, act='leaky_relu', act_alpha=0.2) - return out - - def make_inplace_abn_momentum_variable(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - data = self._get_data( - name='data', shape=[32, 128, 128], dtype="float32" - ) - momentum = self._get_data( - name='momentum', - shape=[1], - dtype='float32', - append_batch_size=False, - ) - out = layers.inplace_abn( - data, momentum=momentum, act='elu', act_alpha=2.0 - ) - return out - def make_range(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() From f3a93e51850a80bd3fe73bf0c1142777e0180b2d Mon Sep 17 00:00:00 2001 From: ccrrong <101700995+ccrrong@users.noreply.github.com> Date: Thu, 24 Nov 2022 17:53:10 +0800 Subject: [PATCH 200/210] Move conv3d from fluid to static.nn.common (#48266) * move conv3d * remove unused import --- python/paddle/fluid/layers/nn.py | 309 ------------------ .../ir/inference/test_trt_conv3d_op.py | 5 +- .../tests/unittests/npu/test_conv3d_op_npu.py | 28 +- .../tests/unittests/test_conv3d_layer.py | 2 +- .../fluid/tests/unittests/test_conv3d_op.py | 28 +- .../tests/unittests/test_conv_nn_grad.py | 14 +- .../tests/unittests/test_functional_conv3d.py | 4 +- .../test_imperative_load_static_param.py | 4 +- .../fluid/tests/unittests/test_layers.py | 4 +- .../tests/unittests/xpu/test_conv3d_op_xpu.py | 28 +- python/paddle/static/nn/__init__.py | 2 +- python/paddle/static/nn/common.py | 309 ++++++++++++++++++ 12 files changed, 370 insertions(+), 367 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0f542bfab1f5a..d2ac562dfd0bb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -69,7 +69,6 @@ 'crf_decoding', 'cos_sim', 'conv2d', - 'conv3d', 'softmax', 'pool2d', 'pool3d', @@ -1683,314 +1682,6 @@ def _get_default_param_initializer(): return helper.append_activation(pre_act) -def conv3d( - input, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - 
use_cudnn=True, - act=None, - name=None, - data_format="NCDHW", -): - r""" - :api_attr: Static Graph - - The convolution3D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are in NCDHW or NDHWC format. Where N is batch size C is the number of - channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. Convlution3D is similar with Convlution2D - but adds one dimension(depth). If bias attribution and activation type are - provided, bias is added to the output of the convolution, and the - corresponding activation function is applied to the final result. - - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - In the above equation: - - * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. - * :math:`W`: Filter value, a tensor with MCDHW format. - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Example: - - - Input: - - Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` - - - Output: - Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 - - Args: - input (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data - type of input is float16 or float32 or float64. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size (int|tuple): The filter size. If filter_size is a tuple, - it must contain three integers, (filter_size_depth, filter_size_height, - filter_size_width). Otherwise, filter_size_depth = filter_size_height = \ - filter_size_width = filter_size. - stride (int|tuple): The stride size. It means the stride in convolution. If stride is a - tuple, it must contain three integers, (stride_depth, stride_height, stride_width). - Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings - on both sides for each dimension. If `padding` is a string, either 'VALID' or - 'SAME' which is the padding algorithm. If padding size is a tuple or list, - it could be in three forms: `[pad_depth, pad_height, pad_width]` or - `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form - `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form - `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. - Default: padding = 0. - dilation (int|tuple): The dilation size. It means the spacing between the kernel points. - If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). 
Otherwise, dilation_depth = dilation_height = dilation_width = dilation. - Default: dilation = 1. - groups (int): The groups number of the Conv3d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1 - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of conv3d. If it is set to None or one attribute of ParamAttr, conv3d - will create ParamAttr as param_attr. If it is set to None, the parameter - is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv3d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn - library is installed. Default: True - act (str): Activation type, if it is set to None, activation is not appended. - Default: None. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - - Returns: - A Variable holding Tensor representing the conv3d, whose data type is - the same with input. If act is None, the tensor variable storing the - convolution result, and if act is not None, the tensor variable storing - convolution and non-linearity activation result. - - Raises: - ValueError: If the type of `use_cudnn` is not bool. - ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If the channel dimmention of the input is less than or equal to zero. - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ShapeError: If the input is not 5-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels * groups. - ShapeError: If the number of output channels is not be divided by groups. - - Examples: - .. 
code-block:: python - - import paddle - import numpy as np - - paddle.enable_static() - data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') - param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) - res = paddle.static.nn.conv3d(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - x = np.random.rand(1, 3, 12, 32, 32).astype("float32") - output = exe.run(feed={"data": x}, fetch_list=[res]) - print(output) - """ - - l_type = 'conv3d' - assert param_attr is not False, "param_attr should not be False here." - helper = LayerHelper(l_type, **locals()) - dtype = helper.input_dtype() - - if not isinstance(use_cudnn, bool): - raise ValueError( - "Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn) - ) - - if data_format not in ["NCDHW", "NDHWC"]: - raise ValueError( - "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s." % str(data_format) - ) - - channel_last = data_format == "NDHWC" - if len(input.shape) != 5: - raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}".format( - input.shape - ) - ) - num_channels = input.shape[4] if channel_last else input.shape[1] - if num_channels < 0: - raise ValueError( - "The channel dimmention of the input(%s) should be defined. " - "Received: %s." % (str(input.shape), str(num_channels)) - ) - - if groups is None: - num_filter_channels = num_channels - elif groups <= 0: - raise ValueError( - "the groups of conv3d should be greater than 0. Received groups: {}".format( - groups - ) - ) - else: - if num_channels % groups != 0: - raise ValueError( - "The number of input channels must be divisible by Attr(groups). " - "Received: number of channels(%s), groups(%s)." - % (str(num_channels), str(groups)) - ) - num_filter_channels = num_channels // groups - - filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') - stride = utils.convert_to_list(stride, 3, 'stride') - dilation = utils.convert_to_list(dilation, 3, 'dilation') - - def _update_padding(padding, data_format): - def is_list_or_tuple(ele): - if isinstance(ele, list) or isinstance(ele, tuple): - return True - return False - - if is_list_or_tuple(padding) and len(padding) == 5: - if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): - if not (padding[0] == [0, 0] and padding[1] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding) - ) - padding = padding[2:5] - padding = [ele for a_list in padding for ele in a_list] - elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): - if not (padding[0] == [0, 0] and padding[4] == [0, 0]): - raise ValueError( - "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." 
% str(padding) - ) - padding = padding[1:4] - padding = [ele for a_list in padding for ele in a_list] - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] - elif is_list_or_tuple(padding) and len(padding) == 6: - padding = utils.convert_to_list(padding, 6, 'padding') - if utils._is_symmetric_padding(padding, 3): - padding = [padding[0], padding[2], padding[4]] - else: - padding = utils.convert_to_list(padding, 3, 'padding') - - return padding - - padding_algorithm = "EXPLICIT" - if isinstance(padding, str): - padding = padding.upper() - if padding not in ["SAME", "VALID"]: - raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." - % str(padding) - ) - if padding == "VALID": - padding_algorithm = "VALID" - padding = [0, 0, 0] - elif padding == "SAME": - padding_algorithm = "SAME" - padding = [0, 0, 0] - - padding = _update_padding(padding, data_format) - - input_shape = input.shape - filter_shape = [num_filters, num_filter_channels] + filter_size - - def _get_default_param_initializer(): - filter_elem_num = ( - filter_size[0] * filter_size[1] * filter_size[2] * num_channels - ) - if filter_elem_num <= 0: - raise ValueError( - "Invalid filter number, excepted number is larger than 0, but" - " received {}, please check the input shape and " - "filter size.".format(filter_elem_num) - ) - - std = (2.0 / filter_elem_num) ** 0.5 - return Normal(0.0, std, 0) - - filter_param = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - default_initializer=_get_default_param_initializer(), - ) - - pre_bias = helper.create_variable_for_type_inference(dtype) - - helper.append_op( - type=l_type, - inputs={ - 'Input': input, - 'Filter': filter_param, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'use_mkldnn': False, - "padding_algorithm": padding_algorithm, - "data_format": data_format, - }, - ) - - if data_format == 'NCDHW': - pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) - else: - pre_act = helper.append_bias_op(pre_bias, dim_start=4, dim_end=5) - - return helper.append_activation(pre_act) - - @templatedoc() def pool2d( input, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py index 70e62f9eaf6c7..1142e6370345a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv3d_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PassVersionChecker @@ -29,7 +30,7 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 3, 6, 32, 32], dtype="float32" ) - conv_out = fluid.layers.conv3d( + conv_out = paddle.static.nn.conv3d( input=data, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, @@ -113,7 +114,7 @@ def setUp(self): data = fluid.data( name="data", shape=[-1, 6, -1, -1, -1], dtype="float32" ) - conv_out = fluid.layers.conv3d( + conv_out = paddle.static.nn.conv3d( input=data, num_filters=self.conv_num_filters, filter_size=self.conv_filter_size, diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py index 81af05b156c86..65ec9a489f4a5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py @@ -364,7 +364,7 @@ def test_api(self): dtype="float32", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ -375,7 +375,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -386,7 +386,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -397,7 +397,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ -408,7 +408,7 @@ def test_api(self): data_format="NDHWC", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -419,7 +419,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -442,7 +442,7 @@ def test_api(self): # ValueError: cudnn def run_1(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -458,7 +458,7 @@ def run_1(): # ValueError: data_format def run_2(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=[3, 3, 3], @@ -474,7 +474,7 @@ def run_2(): # ValueError: padding def run_3(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -489,7 +489,7 @@ def run_3(): self.assertRaises(ValueError, run_3) def run_4(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -504,7 +504,7 @@ def run_4(): self.assertRaises(ValueError, run_4) def run_5(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=0, @@ -527,7 +527,7 @@ def run_5(): ) def run_6(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=x, num_filters=3, filter_size=3, @@ -543,7 +543,7 @@ def run_6(): # ValueError: groups def run_7(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -559,7 +559,7 @@ def run_7(): # ValueError: filter num def run_8(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=0, filter_size=0, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index a8ab4d65961f1..fd7dc6bb63029 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -101,7 +101,7 @@ def fluid_layer(self, place): bias_attr = False else: bias_attr = I.NumpyArrayInitializer(self.bias) - y_var = fluid.layers.conv3d( + y_var = paddle.static.nn.conv3d( x_var, self.num_filters, self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 54a3621e0ba72..d9cd69e4d550c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -878,7 +878,7 @@ def test_api(self): dtype="float32", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ 
-889,7 +889,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -900,7 +900,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -911,7 +911,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ -922,7 +922,7 @@ def test_api(self): data_format="NDHWC", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -933,7 +933,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -956,7 +956,7 @@ def test_api(self): # ValueError: cudnn def run_1(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -972,7 +972,7 @@ def run_1(): # ValueError: data_format def run_2(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=[3, 3, 3], @@ -988,7 +988,7 @@ def run_2(): # ValueError: padding def run_3(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -1003,7 +1003,7 @@ def run_3(): self.assertRaises(ValueError, run_3) def run_4(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -1018,7 +1018,7 @@ def run_4(): self.assertRaises(ValueError, run_4) def run_5(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=0, @@ -1041,7 +1041,7 @@ def run_5(): ) def run_6(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=x, num_filters=3, filter_size=3, @@ -1057,7 +1057,7 @@ def run_6(): # ValueError: groups def run_7(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -1073,7 +1073,7 @@ def run_7(): # ValueError: filter num def run_8(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=0, filter_size=0, diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index b09a86f5bfa0b..19b9d6fdee97c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -110,7 +110,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d(x, 2, 1, bias_attr=False) + y = paddle.static.nn.conv3d(x, 2, 1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) w = fluid.default_main_program().global_block().all_parameters() @@ -137,7 +137,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d(x, 2, 1, padding=1, bias_attr=False) + y = paddle.static.nn.conv3d(x, 2, 1, padding=1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) w = fluid.default_main_program().global_block().all_parameters() @@ -332,7 +332,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d( + y = paddle.static.nn.conv3d( input=x, num_filters=2, filter_size=1, @@ -365,7 +365,7 @@ 
def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d( + y = paddle.static.nn.conv3d( input=x, num_filters=2, filter_size=1, @@ -399,7 +399,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d( + y = paddle.static.nn.conv3d( input=x, num_filters=2, filter_size=1, @@ -432,7 +432,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d( + y = paddle.static.nn.conv3d( input=x, num_filters=2, filter_size=1, @@ -467,7 +467,7 @@ def func(self, place): eps = 0.005 dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) - y = layers.conv3d( + y = paddle.static.nn.conv3d( input=x, num_filters=2, filter_size=1, diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py index d2d7074a4b1a3..71123cb51e74b 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d.py @@ -87,7 +87,7 @@ def static_graph_case_1(self): (-1, self.in_channels, -1, -1, -1), dtype=self.dtype, ) - y = fluid.layers.conv3d( + y = paddle.static.nn.conv3d( x, self.out_channels, self.filter_shape, @@ -480,7 +480,7 @@ def static_graph_case(self): with fluid.unique_name.guard(): with fluid.program_guard(main, start): x = fluid.data("input", self.input.shape, dtype=paddle.float32) - y = fluid.layers.conv3d( + y = paddle.static.nn.conv3d( x, self.num_filters, self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index c96fe97fac87f..7147e924a191a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -51,10 +51,10 @@ def testLoadStaticModel(self): conv3d_in = fluid.data( name='conv3d_in', shape=[None, 3, 12, 32, 32], dtype='float32' ) - conv3d_out_1 = fluid.layers.conv3d( + conv3d_out_1 = paddle.static.nn.conv3d( input=conv3d_in, num_filters=2, filter_size=3, act="relu" ) - conv3d_out_2 = fluid.layers.conv3d( + conv3d_out_2 = paddle.static.nn.conv3d( input=conv3d_in, num_filters=2, filter_size=3, act="relu" ) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 937c027222001..8301a02a2e21d 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1688,7 +1688,9 @@ def test_conv3d(self): images = layers.data( name='pixel', shape=[3, 6, 6, 6], dtype='float32' ) - ret = layers.conv3d(input=images, num_filters=3, filter_size=2) + ret = paddle.static.nn.conv3d( + input=images, num_filters=3, filter_size=2 + ) static_ret = self.get_static_graph_result( feed={'pixel': np.ones([2, 3, 6, 6, 6], dtype='float32')}, fetch_list=[ret], diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py index f949d7eeef87b..3b6b1a4363f16 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py +++ 
b/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py @@ -525,7 +525,7 @@ def test_api(self): dtype="float32", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ -536,7 +536,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -547,7 +547,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -558,7 +558,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NDHWC, num_filters=3, filter_size=[3, 3, 3], @@ -569,7 +569,7 @@ def test_api(self): data_format="NDHWC", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -580,7 +580,7 @@ def test_api(self): data_format="NCDHW", ) - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input_NCDHW, num_filters=3, filter_size=[3, 3, 3], @@ -603,7 +603,7 @@ def test_api(self): # ValueError: cudnn def run_1(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -619,7 +619,7 @@ def run_1(): # ValueError: data_format def run_2(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=[3, 3, 3], @@ -635,7 +635,7 @@ def run_2(): # ValueError: padding def run_3(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -650,7 +650,7 @@ def run_3(): self.assertRaises(ValueError, run_3) def run_4(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -665,7 +665,7 @@ def run_4(): self.assertRaises(ValueError, run_4) def run_5(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=0, @@ -688,7 +688,7 @@ def run_5(): ) def run_6(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=x, num_filters=3, filter_size=3, @@ -704,7 +704,7 @@ def run_6(): # ValueError: groups def run_7(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=3, filter_size=3, @@ -720,7 +720,7 @@ def run_7(): # ValueError: filter num def run_8(): - fluid.layers.conv3d( + paddle.static.nn.conv3d( input=input, num_filters=0, filter_size=0, diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 7f695a48219d4..ef966ecd98ce4 100755 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -14,6 +14,7 @@ from .common import fc # noqa: F401 from .common import deform_conv2d # noqa: F401 +from .common import conv3d # noqa: F401 from .common import conv2d_transpose # noqa: F401 from .common import conv3d_transpose # noqa: F401 @@ -22,7 +23,6 @@ from ...fluid.layers import case # noqa: F401 from ...fluid.layers import cond # noqa: F401 from ...fluid.layers import conv2d # noqa: F401 -from ...fluid.layers import conv3d # noqa: F401 from ...fluid.layers import create_parameter # noqa: F401 from ...fluid.layers import crf_decoding # noqa: F401 from ...fluid.layers import data_norm # noqa: F401 diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index aee2009edd28f..a7470f2fb2e03 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -13,6 +13,7 @@ # limitations under the License. 
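
For callers, the change is only the import path: the test updates above swap fluid.layers.conv3d for paddle.static.nn.conv3d with identical arguments. A minimal sketch reusing the shapes from the docstring example; the expected output shape follows from the D_out/H_out/W_out formulas with stride 1, padding 0 and dilation 1:

    import numpy as np
    import paddle

    paddle.enable_static()
    data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32')
    res = paddle.static.nn.conv3d(input=data, num_filters=2, filter_size=3, act="relu")
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())
    x = np.random.rand(1, 3, 12, 32, 32).astype("float32")
    out, = exe.run(feed={"data": x}, fetch_list=[res])
    # D_out = 12 - 3 + 1 = 10, H_out = W_out = 32 - 3 + 1 = 30
    print(out.shape)  # (1, 2, 10, 30, 30)
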
import paddle +from paddle.fluid.initializer import Normal from paddle.fluid.framework import static_only, Variable, _non_static_mode from paddle.fluid.data_feeder import check_dtype @@ -176,6 +177,314 @@ def fc( ) +def conv3d( + input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format="NCDHW", +): + r""" + :api_attr: Static Graph + + The convolution3D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are in NCDHW or NDHWC format. Where N is batch size C is the number of + channels, D is the depth of the feature, H is the height of the feature, + and W is the width of the feature. Convlution3D is similar with Convlution2D + but adds one dimension(depth). If bias attribution and activation type are + provided, bias is added to the output of the convolution, and the + corresponding activation function is applied to the final result. + + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + In the above equation: + + * :math:`X`: Input value, a tensor with NCDHW or NDHWC format. + * :math:`W`: Filter value, a tensor with MCDHW format. + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Example: + + - Input: + + Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)` + + - Output: + Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ + H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 + + Args: + input (Tensor): The input is 5-D Tensor with shape [N, C, D, H, W], the data + type of input is float16 or float32 or float64. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple): The filter size. If filter_size is a tuple, + it must contain three integers, (filter_size_depth, filter_size_height, + filter_size_width). Otherwise, filter_size_depth = filter_size_height = \ + filter_size_width = filter_size. + stride (int|tuple): The stride size. It means the stride in convolution. If stride is a + tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + on both sides for each dimension. If `padding` is a string, either 'VALID' or + 'SAME' which is the padding algorithm. If padding size is a tuple or list, + it could be in three forms: `[pad_depth, pad_height, pad_width]` or + `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, + and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. 
+ when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. + Default: padding = 0. + dilation (int|tuple): The dilation size. It means the spacing between the kernel points. + If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + Default: dilation = 1. + groups (int): The groups number of the Conv3d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1 + param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights + of conv3d. If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv3d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn + library is installed. Default: True + act (str): Activation type, if it is set to None, activation is not appended. + Default: None. + name(str|None): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + + Returns: + A Variable holding Tensor representing the conv3d, whose data type is + the same with input. If act is None, the tensor variable storing the + convolution result, and if act is not None, the tensor variable storing + convolution and non-linearity activation result. + + Raises: + ValueError: If the type of `use_cudnn` is not bool. + ValueError: If `data_format` is not "NCDHW" or "NDHWC". + ValueError: If the channel dimmention of the input is less than or equal to zero. + ValueError: If `padding` is a string, but not "SAME" or "VALID". + ValueError: If `padding` is a tuple, but the element corresponding to the input's batch size is not 0 + or the element corresponding to the input's channel is not 0. + ShapeError: If the input is not 5-D Tensor. + ShapeError: If the input's dimension size and filter's dimension size not equal. + ShapeError: If the dimension size of input minus the size of `stride` is not 2. + ShapeError: If the number of input channels is not equal to filter's channels * groups. + ShapeError: If the number of output channels is not be divided by groups. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + paddle.enable_static() + data = paddle.static.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') + param_attr = paddle.framework.ParamAttr(name='conv3d.weight', initializer=paddle.nn.initializer.XavierNormal(), learning_rate=0.001) + res = paddle.static.nn.conv3d(input=data, num_filters=2, filter_size=3, act="relu", param_attr=param_attr) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + x = np.random.rand(1, 3, 12, 32, 32).astype("float32") + output = exe.run(feed={"data": x}, fetch_list=[res]) + print(output) + """ + + l_type = 'conv3d' + assert param_attr is not False, "param_attr should not be False here." + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + + if not isinstance(use_cudnn, bool): + raise ValueError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) + + if data_format not in ["NCDHW", "NDHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " + "Attr(data_format): %s." % str(data_format) + ) + + channel_last = data_format == "NDHWC" + if len(input.shape) != 5: + raise ValueError( + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) + num_channels = input.shape[4] if channel_last else input.shape[1] + if num_channels < 0: + raise ValueError( + "The channel dimmention of the input(%s) should be defined. " + "Received: %s." % (str(input.shape), str(num_channels)) + ) + + if groups is None: + num_filter_channels = num_channels + elif groups <= 0: + raise ValueError( + "the groups of conv3d should be greater than 0. Received groups: {}".format( + groups + ) + ) + else: + if num_channels % groups != 0: + raise ValueError( + "The number of input channels must be divisible by Attr(groups). " + "Received: number of channels(%s), groups(%s)." + % (str(num_channels), str(groups)) + ) + num_filter_channels = num_channels // groups + + filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') + stride = utils.convert_to_list(stride, 3, 'stride') + dilation = utils.convert_to_list(dilation, 3, 'dilation') + + def _update_padding(padding, data_format): + def is_list_or_tuple(ele): + if isinstance(ele, list) or isinstance(ele, tuple): + return True + return False + + if is_list_or_tuple(padding) and len(padding) == 5: + if is_list_or_tuple(padding[0]) and (data_format == "NCDHW"): + if not (padding[0] == [0, 0] and padding[1] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." % str(padding) + ) + padding = padding[2:5] + padding = [ele for a_list in padding for ele in a_list] + elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): + if not (padding[0] == [0, 0] and padding[4] == [0, 0]): + raise ValueError( + "Non-zero padding(%s) in the batch or channel dimensions " + "is not supported." 
% str(padding) + ) + padding = padding[1:4] + padding = [ele for a_list in padding for ele in a_list] + padding = utils.convert_to_list(padding, 6, 'padding') + if utils._is_symmetric_padding(padding, 3): + padding = [padding[0], padding[2], padding[4]] + elif is_list_or_tuple(padding) and len(padding) == 6: + padding = utils.convert_to_list(padding, 6, 'padding') + if utils._is_symmetric_padding(padding, 3): + padding = [padding[0], padding[2], padding[4]] + else: + padding = utils.convert_to_list(padding, 3, 'padding') + + return padding + + padding_algorithm = "EXPLICIT" + if isinstance(padding, str): + padding = padding.upper() + if padding not in ["SAME", "VALID"]: + raise ValueError( + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) + if padding == "VALID": + padding_algorithm = "VALID" + padding = [0, 0, 0] + elif padding == "SAME": + padding_algorithm = "SAME" + padding = [0, 0, 0] + + padding = _update_padding(padding, data_format) + + input_shape = input.shape + filter_shape = [num_filters, num_filter_channels] + filter_size + + def _get_default_param_initializer(): + filter_elem_num = ( + filter_size[0] * filter_size[1] * filter_size[2] * num_channels + ) + if filter_elem_num <= 0: + raise ValueError( + "Invalid filter number, excepted number is larger than 0, but" + " received {}, please check the input shape and " + "filter size.".format(filter_elem_num) + ) + + std = (2.0 / filter_elem_num) ** 0.5 + return Normal(0.0, std, 0) + + filter_param = helper.create_parameter( + attr=helper.param_attr, + shape=filter_shape, + dtype=dtype, + default_initializer=_get_default_param_initializer(), + ) + + pre_bias = helper.create_variable_for_type_inference(dtype) + + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + "padding_algorithm": padding_algorithm, + "data_format": data_format, + }, + ) + + if data_format == 'NCDHW': + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) + else: + pre_act = helper.append_bias_op(pre_bias, dim_start=4, dim_end=5) + + return helper.append_activation(pre_act) + + def conv2d_transpose( input, num_filters, From 43b92b633f5d2db98f45d4b9597e5389f6f9712f Mon Sep 17 00:00:00 2001 From: wangxiaoning <71813629+wangxn12138@users.noreply.github.com> Date: Thu, 24 Nov 2022 18:00:17 +0800 Subject: [PATCH 201/210] [Fluid clean] (#48105) * add index sample fp16 support * remove fluid APIs in distributed_strategy.py and role_maker.py * Revert "remove fluid APIs in distributed_strategy.py and role_maker.py" This reverts commit 223bbee990d3bf69e252fc3c0f19e3873550a264. 
* remove fluid APIs in distributed_strategy.py and role_maker.py * remove index sample op changes * remove fluid APIs under fleet.base * remove fluid APIs under fleet.layers.mpu * remove fluid APIs under fleet.meta_optimizers * fix fluid error * fix util_factory.py * reset fluid.io.load_inference_model API --- .../kernels/gpu/index_sample_grad_kernel.cu | 0 paddle/phi/kernels/gpu/index_sample_kernel.cu | 0 .../fleet/base/distributed_strategy.py | 6 ++-- .../distributed/fleet/base/role_maker.py | 10 +++---- .../distributed/fleet/base/util_factory.py | 27 ++++++++++-------- .../distributed/fleet/layers/mpu/mp_layers.py | 2 +- .../distributed/fleet/layers/mpu/mp_ops.py | 28 +++++++++---------- .../distributed/fleet/layers/mpu/random.py | 7 +++-- .../meta_optimizers/raw_program_optimizer.py | 7 +++-- .../meta_optimizers/sharding_optimizer.py | 15 +++++----- .../tensor_parallel_optimizer.py | 4 +-- .../tests/unittests/test_index_sample_op.py | 0 python/paddle/tensor/search.py | 0 13 files changed, 56 insertions(+), 50 deletions(-) mode change 100644 => 100755 paddle/phi/kernels/gpu/index_sample_grad_kernel.cu mode change 100644 => 100755 paddle/phi/kernels/gpu/index_sample_kernel.cu mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_index_sample_op.py mode change 100644 => 100755 python/paddle/tensor/search.py diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu old mode 100644 new mode 100755 diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu old mode 100644 new mode 100755 diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 34207f6ce6f31..32656c19a38dc 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -103,7 +103,7 @@ def _set_distributed_strategy(self, dist_strategy): self.job_info.strategy = dist_strategy -ReduceStrategyFluid = paddle.fluid.BuildStrategy.ReduceStrategy +ReduceStrategyFluid = paddle.static.BuildStrategy.ReduceStrategy ReduceStrategyFleet = int @@ -207,7 +207,7 @@ def execution_strategy(self): strategy.execution_strategy = exe_strategy """ - execution_strategy = paddle.fluid.ExecutionStrategy() + execution_strategy = paddle.static.ExecutionStrategy() fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: setattr( @@ -255,7 +255,7 @@ def build_strategy(self): """ - build_strategy = paddle.fluid.BuildStrategy() + build_strategy = paddle.static.BuildStrategy() fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: value = getattr(self.strategy.build_strategy, f.name) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index b001c5482fdfc..e29cee04fca03 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -19,7 +19,7 @@ from multiprocessing import Process, Manager import paddle -import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.distributed.fleet.base.private_helper_function import ( wait_server_ready, ) @@ -128,7 +128,7 @@ def init( def _init_fs(self, fs_path, prefix): def init(rank, nodes, role): - gloo = fluid.core.Gloo() + gloo = core.Gloo() gloo.set_rank(rank) gloo.set_size(nodes) gloo.set_prefix(prefix) @@ -156,7 +156,7 @@ def init(rank, nodes, role): def 
_init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix): def init(rank, nodes, role): - gloo = fluid.core.Gloo() + gloo = core.Gloo() gloo.set_rank(rank) gloo.set_size(nodes) gloo.set_prefix(prefix) @@ -216,7 +216,7 @@ def init_kv_server(http_server_d): return _http_server def init(rank, nodes, role): - gloo = fluid.core.Gloo() + gloo = core.Gloo() gloo.set_rank(rank) gloo.set_size(nodes) gloo.set_prefix(prefix) @@ -1175,7 +1175,7 @@ def _generate_role(self): else: self._collective_env() self._role_is_generated = True - if not paddle.fluid.framework._non_static_mode(): + if not paddle.framework.in_dynamic_mode(): self._gloo_init() diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index 8717619eafe35..1f9a0c8d5f30c 100755 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -16,12 +16,13 @@ """basic collective operations in python""" """remote file system""" +import paddle from ..utils.fs import FS from paddle.fluid.proto import framework_pb2 -from paddle.fluid.framework import Program +from paddle.static import Program from paddle.fluid import debugger from google.protobuf import text_format -import paddle.fluid as fluid +import paddle.framework as framework from collections import OrderedDict from paddle.fluid import core import subprocess @@ -376,7 +377,7 @@ def _proto_check(self, config): pruned_vars = [ (v.name, v) for v in pruned_prog.list_vars() - if fluid.io.is_persistable(v) + if paddle.static.io.is_persistable(v) ] pruned_vars = OrderedDict(pruned_vars) pruned_vars_name = [name for name in pruned_vars] @@ -460,7 +461,7 @@ def reader(batch_size, fn, dim): ) saved_params = [ - v for v in prog.list_vars() if fluid.io.is_persistable(v) + v for v in prog.list_vars() if paddle.static.io.is_persistable(v) ] print( "persistable vars in dump program: {}".format( @@ -487,15 +488,15 @@ def check_not_expected_ops(prog, not_expected_op_types): ) return False - place = fluid.CPUPlace() - exe = fluid.Executor(place) - scope = fluid.core.Scope() - with fluid.scope_guard(scope): + place = framework.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): ( inference_program, feed_target_names, fetch_targets, - ) = fluid.io.load_inference_model( + ) = paddle.fluid.io.load_inference_model( config.dump_model_dir, exe, model_filename=model_filename, @@ -508,7 +509,7 @@ def check_not_expected_ops(prog, not_expected_op_types): for each_var in saved_params } for each_var in saved_params: - var_temp = fluid.global_scope().find_var(each_var.name) + var_temp = paddle.static.global_scope().find_var(each_var.name) assert var_temp is not None, ( "can't not find var: " + each_var.name ) @@ -639,7 +640,7 @@ def check_not_expected_ops(prog, not_expected_op_types): dtype=feed_config.feeded_vars_types[i], ) feed_tensors.append( - fluid.create_lod_tensor( + paddle.fluid.create_lod_tensor( t, [[1] * config.batch_size], place ) ) @@ -668,7 +669,9 @@ def check_not_expected_ops(prog, not_expected_op_types): ) for i in range(len(feed_config.feeded_vars_names)) ] - feeder = fluid.DataFeeder(feed_list=feed_vars, place=place) + feeder = paddle.fluid.DataFeeder( + feed_list=feed_vars, place=place + ) batch_feed = feed_gen( config.batch_size, feed_config.feeded_vars_dims, diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py index 
8224d2a7b98a0..acbd95f8ff50a 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py @@ -15,7 +15,7 @@ import paddle from . import mp_ops from paddle.fluid import core -from paddle.fluid.dygraph.layers import Layer +from paddle.nn import Layer from .random import get_rng_state_tracker from paddle.nn import functional as F from ...base import topology as tp diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index 83ba760c9e0a7..8a463e996604e 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -15,17 +15,17 @@ import paddle from paddle import _legacy_C_ops from paddle.fluid import core -from paddle.fluid.framework import _non_static_mode -from paddle.fluid.framework import _in_legacy_dygraph -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.framework import _varbase_creator -from paddle.fluid.layer_helper import LayerHelper +from paddle.framework import in_dynamic_mode +from paddle.framework import _in_legacy_dygraph +from paddle.framework import in_dygraph_mode +from paddle.framework import _varbase_creator +from paddle.framework import LayerHelper from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.dygraph import layers +from paddle.nn import Layer from paddle.distributed import collective from ....communication.reduce import ReduceOp, _get_reduce_op from paddle.fluid.data_feeder import check_dtype -import paddle.fluid.dygraph_utils as dygraph_utils +from paddle.common_ops_import import dygraph_utils def _c_identity(tensor, group=None): @@ -123,7 +123,7 @@ def _c_concat(tensor, group=None): rank = group.rank nranks = group.nranks - if _non_static_mode(): + if in_dynamic_mode(): return _legacy_C_ops.c_concat( tensor, 'ring_id', @@ -189,7 +189,7 @@ def _c_split(tensor, group=None): else group.nranks ) - if _non_static_mode(): + if in_dynamic_mode(): return _legacy_C_ops.c_split( tensor, 'use_calc_stream', @@ -335,7 +335,7 @@ def _c_lookup_table(table, index, start_index=0, name=None): Returns: Tensor. 
""" - if _non_static_mode(): + if in_dynamic_mode(): return _legacy_C_ops.c_embedding( table, index, "start_index", start_index ) @@ -354,7 +354,7 @@ def _c_lookup_table(table, index, start_index=0, name=None): return tmp -class _Linear(layers.Layer): +class _Linear(Layer): """ Linear """ @@ -424,7 +424,7 @@ def _c_softmax_with_cross_entropy( if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=-1) - if _non_static_mode(): + if in_dynamic_mode(): softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks ) @@ -458,7 +458,7 @@ def _linear(x, weight, bias=None, name=None): """ Fuction Linear """ - if _non_static_mode(): + if in_dynamic_mode(): pre_bias = _varbase_creator(dtype=x.dtype) _legacy_C_ops.matmul( x, @@ -825,7 +825,7 @@ def split( supported_operations ) ) - if _non_static_mode(): + if in_dynamic_mode(): raise ValueError( "paddle.distributed.split cannot be used in dynamic " "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " diff --git a/python/paddle/distributed/fleet/layers/mpu/random.py b/python/paddle/distributed/fleet/layers/mpu/random.py index 17442c1938a1d..5661804a27966 100644 --- a/python/paddle/distributed/fleet/layers/mpu/random.py +++ b/python/paddle/distributed/fleet/layers/mpu/random.py @@ -18,8 +18,9 @@ from paddle import _legacy_C_ops from paddle.fluid import core from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import Variable, _non_static_mode -from paddle.fluid.layer_helper import LayerHelper +from paddle.static import Variable +from paddle.framework import in_dynamic_mode +from paddle.framework import LayerHelper __all__ = [] @@ -209,7 +210,7 @@ def dropout( ) # semantic transfer # dygraph using tracker, doesn't need determinate seed - if _non_static_mode(): + if in_dynamic_mode(): out, mask = _legacy_C_ops.dropout( x, 'dropout_prob', diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index c16f60139dbe4..6b1425c703f97 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -11,8 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -import paddle.fluid as fluid -from paddle.fluid import core, unique_name +import paddle.static as static +from paddle.fluid import core +from paddle.utils import unique_name from .meta_optimizer_base import MetaOptimizerBase from .common import ( OpRole, @@ -132,7 +133,7 @@ def minimize_impl( self.rank = self.role_maker._worker_index() self.nranks = self.role_maker._worker_num() if startup_program is None: - startup_program = fluid.default_startup_program() + startup_program = static.default_startup_program() self.startup_program = startup_program block = loss.block diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index fe11a788c51e1..05fa6e16ca51a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -13,10 +13,11 @@ # limitations under the License. 
import os -from paddle.fluid import unique_name, core -import paddle.fluid as fluid +from paddle.fluid import core +from paddle.utils import unique_name +from paddle.fluid.optimizer import PipelineOptimizer from paddle.static import default_startup_program, device_guard -from paddle.fluid import layers +from paddle.static import create_global_var from .common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper, OP_ROLE_KEY from .common import is_backward_op, is_optimizer_op, is_update_op @@ -275,7 +276,7 @@ def _inner_opt_minimize( ) if self.pp_degree > 1: - pp_optimizer = fluid.optimizer.PipelineOptimizer( + pp_optimizer = PipelineOptimizer( self.inner_opt, self._gradient_merge_acc_step ) self._pp_optimizer = pp_optimizer @@ -1916,7 +1917,7 @@ def create_persistable_gradients_and_insert_merge_ops( def _create_gm_cond(self, main_block): # Add const var - acc_step_var = layers.create_global_var( + acc_step_var = create_global_var( name="gradient_merge_acc_step", shape=[1], value=int(self._gradient_merge_acc_step), @@ -1925,7 +1926,7 @@ def _create_gm_cond(self, main_block): force_cpu=True, ) - zero_var = layers.create_global_var( + zero_var = create_global_var( name="gradient_merge_zero", shape=[1], value=int(0), @@ -1935,7 +1936,7 @@ def _create_gm_cond(self, main_block): ) # Add step var & cond var - current_step_var = layers.create_global_var( + current_step_var = create_global_var( name="gradient_merge_current_step", shape=[1], value=int(0), diff --git a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py index f798a6d3f430e..41ef5f6190ebf 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
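An aside on the pattern running through the hunks above: most changes swap an entry point reached through paddle.fluid for the same object re-imported from a public 2.x namespace (paddle.static, paddle.framework, paddle.nn, paddle.utils). The lines below are an illustrative sketch of the new-style imports, assuming only that Paddle 2.x is installed; the variable names are not taken from the patch.

import paddle.static as static                # replaces: import paddle.fluid as fluid
from paddle.framework import in_dynamic_mode  # replaces: paddle.fluid.framework._non_static_mode

startup = static.default_startup_program()    # replaces: fluid.default_startup_program()
build = static.BuildStrategy()                # replaces: paddle.fluid.BuildStrategy()
print(type(startup).__name__, type(build).__name__, in_dynamic_mode())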
# See the License for the specific language governing permissions and -import paddle.fluid as fluid +import paddle.static as static from .meta_optimizer_base import MetaOptimizerBase from .common import ( CollectiveHelper, @@ -174,7 +174,7 @@ def minimize_impl( self.current_endpoint = self.endpoints[self.role_maker._worker_index()] self.startup_program = startup_program if startup_program is None: - self.startup_program = fluid.default_startup_program() + self.startup_program = static.default_startup_program() optimize_ops, params_grads = self.inner_opt.minimize( loss, self.startup_program, parameter_list, no_grad_set diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py old mode 100644 new mode 100755 diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py old mode 100644 new mode 100755 From 4952f344dd573f854e33ef2f067eb437a3565972 Mon Sep 17 00:00:00 2001 From: 201716010711 <87008376+201716010711@users.noreply.github.com> Date: Thu, 24 Nov 2022 18:34:00 +0800 Subject: [PATCH 202/210] clean fluid task: transfer uniform_random_batch_size_like api (#48270) --- python/paddle/distribution/uniform.py | 3 +- python/paddle/fluid/layers/distributions.py | 6 +- python/paddle/fluid/layers/nn.py | 110 ------------------ ...perative_star_gan_with_gradient_penalty.py | 3 +- .../fluid/tests/unittests/test_layers.py | 3 +- .../unittests/test_uniform_random_bf16_op.py | 3 +- .../tests/unittests/test_uniform_random_op.py | 7 +- python/paddle/tensor/random.py | 95 +++++++++++++++ 8 files changed, 111 insertions(+), 119 deletions(-) diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index c8f8c40a758a4..a5013ab9880dd 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -15,6 +15,7 @@ import numpy as np from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution +from paddle.tensor import random from paddle.fluid.data_feeder import check_type, convert_dtype from paddle.fluid.framework import ( _non_static_mode, @@ -167,7 +168,7 @@ def sample(self, shape, seed=0): zero_tmp = tensor.fill_constant_batch_size_like( self.low + self.high, batch_shape + shape, self.dtype, 0.0 ) - uniform_random_tmp = nn.uniform_random_batch_size_like( + uniform_random_tmp = random.uniform_random_batch_size_like( zero_tmp, zero_tmp.shape, dtype=self.dtype, diff --git a/python/paddle/fluid/layers/distributions.py b/python/paddle/fluid/layers/distributions.py index 18b7f26713ab1..4d13260c614e1 100644 --- a/python/paddle/fluid/layers/distributions.py +++ b/python/paddle/fluid/layers/distributions.py @@ -221,8 +221,10 @@ def sample(self, shape, seed=0): zero_tmp = tensor.fill_constant_batch_size_like( self.low + self.high, batch_shape + shape, self.low.dtype, 0.0 ) - uniform_random_tmp = nn.uniform_random_batch_size_like( - zero_tmp, zero_tmp.shape, min=0.0, max=1.0, seed=seed + uniform_random_tmp = ( + paddle.tensor.random.uniform_random_batch_size_like( + zero_tmp, zero_tmp.shape, min=0.0, max=1.0, seed=seed + ) ) output = ( uniform_random_tmp * (zero_tmp + self.high - self.low) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d2ac562dfd0bb..215dd0845e725 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -121,7 +121,6 @@ 'elementwise_div', 'elementwise_sub', 'elementwise_mul', - 'uniform_random_batch_size_like', 'gaussian_random', 'sampling_id', 
'gaussian_random_batch_size_like', @@ -7182,115 +7181,6 @@ def flatten(x, axis=1, name=None): from paddle.fluid.framework import convert_np_dtype_to_dtype_ -@deprecated(since='1.8.0', update_to="paddle.uniform") -@templatedoc() -def uniform_random_batch_size_like( - input, - shape, - dtype='float32', - input_dim_idx=0, - output_dim_idx=0, - min=-1.0, - max=1.0, - seed=0, -): - """ - This OP initializes a variable with random values sampled from a - uniform distribution in the range [min, max). The input_dim_idx used to get the input dimension value which will be used to resize the output dimension. - - .. code-block:: text - - *Case 1: - - Given: - input =[[0.946741 , 0.1357001 , 0.38086128]] # input.shape=[1,3] - shape=[2,4] - - result.shape[output_dim_idx] = input.shape[input_dim_idx], - output_dim_idx = 0, - input_dim_idx = 0, - result.shape[0] = input.shape[0], - then: - result=[[ 0.3443427 , -0.23056602, 0.3477049 , 0.06139076]] # result.shape=[1,4] - - *Case 2: - - Given: - input =[[0.946741 , 0.1357001 , 0.38086128]] # input.shape=[1,3] - shape=[2,4] - input_dim_idx=1 - output_dim_idx=1 - - result.shape[output_dim_idx] = input.shape[input_dim_idx], - output_dim_idx = 1, - input_dim_idx = 1, - result.shape[1] = input.shape[1], - then: - result=[[-0.23133647, -0.84195036, 0.21441269], - [-0.08774924, 0.25605237, -0.09403259]] # result.shape=[2,3] - Args: - input (Variable): A Tensor. Supported data types: float32, float64. - shape (tuple|list): A python list or python tuple. The shape of the output Tensor, the data type is int. - input_dim_idx (int, optional): An index used to get the input dimension value which will be used to resize the output dimension. Default 0. - output_dim_idx (int, optional): An index used to indicate the specific dimension that will be replaced by corresponding input dimension value. Default 0. - min (float, optional): The lower bound on the range of random values to generate, the min is included in the range. Default -1.0. - max (float, optional): The upper bound on the range of random values to generate, the max is excluded in the range. Default 1.0. - seed (int, optional): Random seed used for generating samples. 0 means use a seed generated by the system.Note that if seed is not 0, this operator will always generate the same random numbers every time. - dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output Tensor. Supported data types: float32, float64. Default float32. - Returns: - Variable: A Tensor of the specified shape filled with uniform_random values. The shape of the Tensor is determined by the shape parameter and the specified dimension of the input Tensor. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - - # example 1: - input = fluid.data(name="input", shape=[1, 3], dtype='float32') - out_1 = fluid.layers.uniform_random_batch_size_like(input, [2, 4]) # out_1.shape=[1, 4] - - # example 2: - out_2 = fluid.layers.uniform_random_batch_size_like(input, [2, 4], input_dim_idx=1, output_dim_idx=1) # out_2.shape=[2, 3] - - - """ - check_variable_and_dtype( - input, - 'Input', - ("float32", 'float64', "uint16"), - 'uniform_random_batch_size_like', - ) - check_type(shape, 'shape', (list, tuple), 'uniform_random_batch_size_like') - check_dtype( - dtype, - 'dtype', - ('float32', 'float64', "uint16"), - 'uniform_random_batch_size_like', - ) - - helper = LayerHelper('uniform_random_batch_size_like', **locals()) - out = helper.create_variable_for_type_inference(dtype) - c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op( - type='uniform_random_batch_size_like', - inputs={'Input': input}, - outputs={'Out': out}, - attrs={ - 'shape': shape, - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx, - 'min': min, - 'max': max, - 'seed': seed, - 'dtype': c_dtype, - }, - ) - - return out - - @deprecated(since="2.0.0", update_to="paddle.normal") @templatedoc() def gaussian_random( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index d623a277cf006..5d969804a8e1d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -14,6 +14,7 @@ import paddle import paddle.fluid as fluid +from paddle.tensor import random import numpy as np import unittest from paddle import _legacy_C_ops @@ -402,7 +403,7 @@ def calc_gradients(outputs, inputs, no_grad_set): def gradient_penalty(f, real, fake, no_grad_set, cfg): def _interpolate(a, b): shape = [a.shape[0]] - alpha = fluid.layers.uniform_random_batch_size_like( + alpha = random.uniform_random_batch_size_like( input=a, shape=shape, min=0.1, max=1.0, seed=cfg.seed ) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 8301a02a2e21d..00733426b6a55 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -33,6 +33,7 @@ from paddle.fluid.dygraph import base from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import _test_eager_guard +from paddle.tensor import random import paddle.nn.functional as F @@ -3555,7 +3556,7 @@ def make_uniform_random_batch_size_like(self): input = self._get_data( name="input", shape=[13, 11], dtype='float32' ) - out = layers.uniform_random_batch_size_like(input, [-1, 11]) + out = random.uniform_random_batch_size_like(input, [-1, 11]) return out def make_gaussian_random(self): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py index 0977ec69ac3ae..b417789ec01c4 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py @@ -23,6 +23,7 @@ output_hist, output_hist_diag, ) +from paddle.tensor import random class TestUniformRandomOpBF16(OpTest): @@ -262,7 +263,7 @@ def test_attr_tensorlist_int32_API(self): train_program = 
fluid.Program() with fluid.program_guard(train_program, startup_program): input = fluid.data(name="input", shape=[1, 3], dtype='uint16') - out_1 = fluid.layers.uniform_random_batch_size_like( + out_1 = random.uniform_random_batch_size_like( input, [2, 4], dtype=np.uint16 ) # out_1.shape=[1, 4] diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index efcbf075bf3fc..dbc036cb7e47f 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -25,6 +25,7 @@ from paddle.fluid.framework import _test_eager_guard from test_attribute_var import UnittestBase +from paddle.tensor import random def output_hist(out): @@ -481,7 +482,7 @@ def test_Variable(): x1 = fluid.create_lod_tensor( np.zeros((100, 784)), [[10, 10, 10, 70]], fluid.CPUPlace() ) - fluid.layers.uniform_random_batch_size_like(x1) + random.uniform_random_batch_size_like(x1) self.assertRaises(TypeError, test_Variable) @@ -489,7 +490,7 @@ def test_shape(): x1 = fluid.layers.data( name='x2', shape=[100, 784], dtype='float32' ) - fluid.layers.uniform_random_batch_size_like(x1, shape="shape") + random.uniform_random_batch_size_like(x1, shape="shape") self.assertRaises(TypeError, test_shape) @@ -497,7 +498,7 @@ def test_dtype(): x2 = fluid.layers.data( name='x2', shape=[100, 784], dtype='float32' ) - fluid.layers.uniform_random_batch_size_like(x2, 'int32') + random.uniform_random_batch_size_like(x2, 'int32') self.assertRaises(TypeError, test_dtype) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 54e8459661c45..da7f8b6a28513 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -215,6 +215,101 @@ def multinomial(x, num_samples=1, replacement=False, name=None): return out +def uniform_random_batch_size_like( + input, + shape, + dtype='float32', + input_dim_idx=0, + output_dim_idx=0, + min=-1.0, + max=1.0, + seed=0, +): + """ + This OP initializes a variable with random values sampled from a + uniform distribution in the range [min, max). The input_dim_idx used to get the input dimension value which will be used to resize the output dimension. + .. code-block:: text + *Case 1: + Given: + input =[[0.946741 , 0.1357001 , 0.38086128]] # input.shape=[1,3] + shape=[2,4] + result.shape[output_dim_idx] = input.shape[input_dim_idx], + output_dim_idx = 0, + input_dim_idx = 0, + result.shape[0] = input.shape[0], + then: + result=[[ 0.3443427 , -0.23056602, 0.3477049 , 0.06139076]] # result.shape=[1,4] + *Case 2: + Given: + input =[[0.946741 , 0.1357001 , 0.38086128]] # input.shape=[1,3] + shape=[2,4] + input_dim_idx=1 + output_dim_idx=1 + result.shape[output_dim_idx] = input.shape[input_dim_idx], + output_dim_idx = 1, + input_dim_idx = 1, + result.shape[1] = input.shape[1], + then: + result=[[-0.23133647, -0.84195036, 0.21441269], + [-0.08774924, 0.25605237, -0.09403259]] # result.shape=[2,3] + Args: + input (Variable): A Tensor. Supported data types: float32, float64. + shape (tuple|list): A python list or python tuple. The shape of the output Tensor, the data type is int. + input_dim_idx (int, optional): An index used to get the input dimension value which will be used to resize the output dimension. Default 0. + output_dim_idx (int, optional): An index used to indicate the specific dimension that will be replaced by corresponding input dimension value. Default 0. 
+ min (float, optional): The lower bound on the range of random values to generate, the min is included in the range. Default -1.0. + max (float, optional): The upper bound on the range of random values to generate, the max is excluded in the range. Default 1.0. + seed (int, optional): Random seed used for generating samples. 0 means use a seed generated by the system.Note that if seed is not 0, this operator will always generate the same random numbers every time. + dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output Tensor. Supported data types: float32, float64. Default float32. + Returns: + Variable: A Tensor of the specified shape filled with uniform_random values. The shape of the Tensor is determined by the shape parameter and the specified dimension of the input Tensor. + Examples: + .. code-block:: python + import paddle + import paddle.fluid as fluid + from paddle.tensor import random + paddle.enable_static() + # example 1: + input = fluid.data(name="input", shape=[1, 3], dtype='float32') + out_1 = random.uniform_random_batch_size_like(input, [2, 4]) # out_1.shape=[1, 4] + # example 2: + out_2 = random.uniform_random_batch_size_like(input, [2, 4], input_dim_idx=1, output_dim_idx=1) # out_2.shape=[2, 3] + """ + check_variable_and_dtype( + input, + 'Input', + ("float32", 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) + check_type(shape, 'shape', (list, tuple), 'uniform_random_batch_size_like') + check_dtype( + dtype, + 'dtype', + ('float32', 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) + + helper = LayerHelper('uniform_random_batch_size_like', **locals()) + out = helper.create_variable_for_type_inference(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='uniform_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'min': min, + 'max': max, + 'seed': seed, + 'dtype': c_dtype, + }, + ) + + return out + + def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): """ Returns a Tensor filled with random values sampled from a Gaussian From ac8a4b1619b4d007085467f50e8f0d10d2994d7c Mon Sep 17 00:00:00 2001 From: yuehuayingxueluo <867460659@qq.com> Date: Thu, 24 Nov 2022 18:57:09 +0800 Subject: [PATCH 203/210] clear fluid api: kldiv_loss, kldiv_loss, mse_loss (#48147) --- python/paddle/fluid/layers/loss.py | 142 ------------------ .../tests/unittests/ipu/test_dy2static_ipu.py | 2 +- .../unittests/ipu/test_kldiv_loss_op_ipu.py | 2 +- .../fluid/tests/unittests/test_layers.py | 6 +- .../fluid/tests/unittests/test_mse_loss.py | 7 +- .../tests/unittests/test_npair_loss_op.py | 15 +- 6 files changed, 17 insertions(+), 157 deletions(-) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 710382bdd26fa..5af7111e58243 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -41,9 +41,6 @@ 'nce', 'softmax_with_cross_entropy', 'sigmoid_cross_entropy_with_logits', - 'kldiv_loss', - 'npair_loss', - 'mse_loss', ] kIgnoreIndex = -100 @@ -826,142 +823,3 @@ def sigmoid_cross_entropy_with_logits( outputs={"Out": out}, ) return out - - -@deprecated(since="2.0.0", update_to="paddle.nn.functional.kl_div") -@templatedoc() -def kldiv_loss(x, target, reduction='mean', name=None): - """ - - ${comment} - - Args: - x (Tensor): ${x_comment} - target (Tensor): ${target_comment} - reduction (Tensor): ${reduction_comment} - name(str, 
optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tensor: The KL divergence loss. The data type is same as input tensor - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - x = paddle.rand(shape=[3,4,2,2], dtype='float32') - target = paddle.rand(shape=[3,4,2,2], dtype='float32') - - # 'batchmean' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean') - print(loss.shape) # shape=[1] - - # 'mean' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean') - print(loss.shape) # shape=[1] - - # 'sum' reduction, loss shape will be [1] - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='sum') - print(loss.shape) # shape=[1] - - # 'none' reduction, loss shape is same with X shape - loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='none') - print(loss.shape) # shape=[3, 4, 2, 2] - - """ - helper = LayerHelper('kldiv_loss', **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'kldiv_loss') - check_variable_and_dtype( - target, 'target', ['float32', 'float64'], 'kldiv_loss' - ) - check_type(reduction, 'reduction', str, 'kldiv_loss') - loss = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type='kldiv_loss', - inputs={'X': x, 'Target': target}, - outputs={'Loss': loss}, - attrs={'reduction': reduction}, - ) - return loss - - -from .control_flow import equal - - -def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - - Npair loss requires paired data. Npair loss has two parts: the first part is L2 - regularizer on the embedding vector; the second part is cross entropy loss which - takes the similarity matrix of anchor and positive as logits. - - For more information, please refer to: - `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - - Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], - the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], - the data type is float32 or float64. - labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. - l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - - - Returns: - A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - - Examples: - - .. code-block:: python - - import paddle - - DATATYPE = "float32" - - anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) - positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) - labels = paddle.rand(shape=(18,), dtype=DATATYPE) - - npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) - print(npair_loss) - - """ - return paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg) - - -def mse_loss(input, label): - """ - - This op accepts input predications and target label and returns the mean square error. - - The loss can be described as: - - .. math:: - - Out = MEAN((input - label)^2) - - Parameters: - input (Tensor): Input tensor, the data type should be float32. - label (Tensor): Label tensor, the data type should be float32. - - Returns: - Tensor: The tensor storing the mean square error difference of input and label. - - Return type: Tensor. - - Examples: - .. 
code-block:: python - - import paddle - input = paddle.to_tensor([1.1, 1.9]) - label = paddle.to_tensor([1.0, 2.0]) - output = paddle.fluid.layers.mse_loss(input, label) - print(output.numpy()) - # [0.01] - """ - check_variable_and_dtype(input, "input", ['float32', 'float64'], 'mse_loss') - check_variable_and_dtype(label, "label", ['float32', 'float64'], 'mse_loss') - return nn.reduce_mean(square_error_cost(input, label)) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py index 701ae1531fae0..4e16b0efdf822 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -238,7 +238,7 @@ def create_model(self, use_ipu=False): class TestWithoutIdentityLoss3(TestBase): def set_op_attrs(self): - self.loss_op = partial(paddle.fluid.layers.kldiv_loss, reduction="none") + self.loss_op = partial(paddle.nn.functional.kl_div, reduction="none") def set_data_feed(self): self.data = paddle.uniform((8, 3, 10, 10), dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py index d08df5399d939..62577b61b0e46 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py @@ -57,7 +57,7 @@ def build_model(self, on_ipu): target = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32' ) - out = paddle.fluid.layers.kldiv_loss(x, target, **self.attrs) + out = paddle.nn.functional.kl_div(x, target, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 00733426b6a55..f78bb59a06926 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3731,7 +3731,9 @@ def make_kldiv_loss(self): dtype="float32", append_batch_size=False, ) - loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean') + loss = paddle.nn.functional.kl_div( + input=x, label=target, reduction='batchmean' + ) return loss def make_temporal_shift(self): @@ -3773,7 +3775,7 @@ def make_mse_loss(self): ): x = self._get_data(name="X", shape=[1], dtype="float32") y = self._get_data(name="Y", shape=[1], dtype="float32") - out = layers.mse_loss(input=x, label=y) + out = paddle.nn.functional.mse_loss(input=x, label=y) return out def make_square_error_cost(self): diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index b0b59694406a7..1cf52d4d6742b 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -17,7 +17,6 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid -import paddle.fluid.layers as layers from paddle.fluid.executor import Executor @@ -32,7 +31,7 @@ def test_mse_loss(self): input_var = fluid.data(name="input", shape=[-1, 3], dtype="float32") label_var = fluid.data(name="label", shape=[-1, 3], dtype="float32") - output = layers.mse_loss(input=input_var, label=label_var) + output = paddle.nn.functional.mse_loss(input=input_var, label=label_var) for use_cuda in ( [False, True] if core.is_compiled_with_cuda() else [False] ): @@ -52,14 +51,14 @@ def test_error(self): def 
test_invalid_input(): input = [256, 3] label = fluid.data(name='label1', shape=[None, 3], dtype='float32') - loss = fluid.layers.mse_loss(input, label) + loss = paddle.nn.functional.mse_loss(input, label) self.assertRaises(TypeError, test_invalid_input) def test_invalid_label(): input = fluid.data(name='input1', shape=[None, 3], dtype='float32') label = [256, 3] - loss = fluid.layers.mse_loss(input, label) + loss = paddle.nn.functional.mse_loss(input, label) self.assertRaises(TypeError, test_invalid_label) diff --git a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py b/python/paddle/fluid/tests/unittests/test_npair_loss_op.py index f90b3f4cdff60..97a732325aab9 100644 --- a/python/paddle/fluid/tests/unittests/test_npair_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_npair_loss_op.py @@ -14,6 +14,7 @@ import unittest import paddle.fluid as fluid +import paddle import paddle.fluid.core as core import numpy as np from paddle.fluid import Program, program_guard @@ -99,7 +100,7 @@ def test_npair_loss(self): append_batch_size=False, ) - npair_loss_op = fluid.layers.npair_loss( + npair_loss_op = paddle.nn.functional.npair_loss( anchor=anc, positive=pos, labels=lab, l2_reg=reg_lambda ) out_tensor = exe.run( @@ -140,19 +141,19 @@ def test_errors(self): def test_anchor_Variable(): # the anchor type must be Variable - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_np, positive=positive_data, labels=labels_data ) def test_positive_Variable(): # the positive type must be Variable - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_data, positive=positive_np, labels=labels_data ) def test_labels_Variable(): # the labels type must be Variable - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_data, positive=positive_data, labels=labels_np ) @@ -165,7 +166,7 @@ def test_anchor_type(): anchor_data1 = fluid.data( name='anchor1', shape=[2, 4], dtype='int32' ) - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_data, positive=positive_data, labels=labels_np ) @@ -174,7 +175,7 @@ def test_positive_type(): positive_data1 = fluid.data( name='positive1', shape=[2, 4], dtype='int32' ) - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_data, positive=positive_data1, labels=labels_np, @@ -185,7 +186,7 @@ def test_labels_type(): labels_data1 = fluid.data( name='labels1', shape=[2], dtype='int32' ) - fluid.layers.npair_loss( + paddle.nn.functional.npair_loss( anchor=anchor_data, positive=positive_data, labels=labels_data1, From 22555e96e40101a8bbb02a8f977c41657b2b48d9 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Thu, 24 Nov 2022 19:41:56 +0800 Subject: [PATCH 204/210] add pad3d and pad3d_grad op for xpu, test=kunlun (#48306) --- cmake/external/xpu.cmake | 2 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + paddle/phi/kernels/xpu/pad3d_grad_kernel.cc | 108 +++ paddle/phi/kernels/xpu/pad3d_kernel.cc | 187 +++++ .../tests/unittests/xpu/test_pad3d_op_xpu.py | 701 ++++++++++++++++++ 5 files changed, 999 insertions(+), 1 deletion(-) create mode 100644 paddle/phi/kernels/xpu/pad3d_grad_kernel.cc create mode 100644 paddle/phi/kernels/xpu/pad3d_kernel.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 8d485fba6a3bd..f4f1ff479c72f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ 
set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221120") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221124") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 62a4daf727503..7474ac88ac8db 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -433,6 +433,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"p_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"p_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pad3d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pad3d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc b/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc new file mode 100644 index 0000000000000..035ab6c4b1adc --- /dev/null +++ b/paddle/phi/kernels/xpu/pad3d_grad_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
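Before the kernel sources, a usage aside: the new XPU pad3d and pad3d_grad kernels are reached through the existing public padding API rather than a new Python entry point. The sketch below mirrors the functional-API calls in the unit test added at the end of this patch; it assumes a Kunlun/XPU build with device 0 available, so the set_device line would need to change on other builds.

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.set_device("xpu:0")   # assumes an XPU build with device 0; use "cpu" otherwise
x = paddle.to_tensor(np.random.rand(2, 3, 4, 5, 6).astype("float32"), stop_gradient=False)
# pad order for an NCDHW input is [left, right, top, bottom, front, back]
y = F.pad(x, pad=[1, 2, 1, 1, 3, 4], mode="replicate", data_format="NCDHW")
print(y.shape)               # padded shape: [2, 3, 11, 7, 9]
y.sum().backward()           # exercises the matching pad3d_grad kernel
print(x.grad.shape)          # [2, 3, 4, 5, 6], same as the input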
+ +#include "paddle/phi/kernels/pad3d_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Pad3dGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const IntArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* x_grad) { + T value = static_cast(pad_value); + std::vector pads = paddings.GetData(); + + auto* d_out = &out_grad; + auto* d_in = x_grad; + auto d_in_dims = d_in->dims(); + const T* d_out_data = d_out->data(); + T* d_in_data = dev_ctx.template Alloc(d_in); + + bool is_ncdhw = true; + if (data_format == "NDHWC") { + is_ncdhw = false; + } + + const int num = d_in_dims[0]; // n + int channels = d_in_dims[1]; // c + int in_depth = d_in_dims[2]; // xd + int in_height = d_in_dims[3]; // xh + int in_width = d_in_dims[4]; // xw + if (data_format == "NDHWC") { + channels = d_in_dims[4]; + in_depth = d_in_dims[1]; + in_height = d_in_dims[2]; + in_width = d_in_dims[3]; + } + + std::vector pads_xpu(6); + pads_xpu[0] = pads[4]; // pf + pads_xpu[1] = pads[5]; // pb + pads_xpu[2] = pads[2]; // pt + pads_xpu[3] = pads[3]; // pd + pads_xpu[4] = pads[0]; // pl + pads_xpu[5] = pads[1]; // pr + + if (mode == "reflect") { + int r = xpu::reflection_pad3d_grad(dev_ctx.x_context(), + d_out_data, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reflection_pad3d_grad"); + } else if (mode == "replicate") { + int r = xpu::replication_pad3d_grad(dev_ctx.x_context(), + d_out_data, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "replication_pad3d_grad"); + } else if (mode == "constant") { + int r = xpu::constant_pad3d_grad(dev_ctx.x_context(), + d_out_data, + d_in_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + value, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant_pad3d_grad"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(pad3d_grad, XPU, ALL_LAYOUT, phi::Pad3dGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/pad3d_kernel.cc b/paddle/phi/kernels/xpu/pad3d_kernel.cc new file mode 100644 index 0000000000000..7cf730205ae83 --- /dev/null +++ b/paddle/phi/kernels/xpu/pad3d_kernel.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + T value = static_cast(pad_value); + std::vector pads = paddings.GetData(); + + auto in_dims = x.dims(); + const T* in_data = x.data(); + + bool is_ncdhw = true; + if (data_format == "NCDHW") { + out->Resize({in_dims[0], + in_dims[1], + in_dims[2] + pads[4] + pads[5], + in_dims[3] + pads[2] + pads[3], + in_dims[4] + pads[0] + pads[1]}); + } else { + is_ncdhw = false; + out->Resize({in_dims[0], + in_dims[1] + pads[4] + pads[5], + in_dims[2] + pads[2] + pads[3], + in_dims[3] + pads[0] + pads[1], + in_dims[4]}); + } + + T* out_data = dev_ctx.template Alloc(out); + + const int num = in_dims[0]; // n + int channels = in_dims[1]; // c + int in_depth = in_dims[2]; // xd + int in_height = in_dims[3]; // xh + int in_width = in_dims[4]; // xw + if (data_format == "NDHWC") { + channels = in_dims[4]; + in_depth = in_dims[1]; + in_height = in_dims[2]; + in_width = in_dims[3]; + } + + if (mode == "circular") { + PADDLE_THROW(phi::errors::External( + "XPU is not support circular padding mode in pad3d")); + } + + if (mode == "reflect") { + PADDLE_ENFORCE_GT( + in_depth, + pads[4], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_front" + " in reflect mode" + ", but received depth(%d) and pad_front(%d).", + in_depth, + pads[4])); + PADDLE_ENFORCE_GT( + in_depth, + pads[5], + errors::InvalidArgument("The depth of Input(X)'s dimension should be " + "greater than pad_back" + " in reflect mode" + ", but received depth(%d) and pad_back(%d).", + in_depth, + pads[5])); + + PADDLE_ENFORCE_GT( + in_height, + pads[2], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_top" + " in reflect mode" + ", but received depth(%d) and pad_top(%d).", + in_height, + pads[2])); + PADDLE_ENFORCE_GT( + in_height, + pads[3], + errors::InvalidArgument("The height of Input(X)'s dimension should be " + "greater than pad_bottom" + " in reflect mode" + ", but received depth(%d) and pad_bottom(%d).", + in_height, + pads[3])); + + PADDLE_ENFORCE_GT( + in_width, + pads[0], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_left" + " in reflect mode" + ", but received depth(%d) and pad_left(%d).", + in_width, + pads[0])); + PADDLE_ENFORCE_GT( + in_width, + pads[1], + errors::InvalidArgument("The width of Input(X)'s dimension should be " + "greater than pad_right" + " in reflect mode" + ", but received depth(%d) and pad_right(%d).", + in_width, + pads[1])); + } else if (mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, + 0, + errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); + } + + std::vector pads_xpu(6); + pads_xpu[0] = pads[4]; // pf + pads_xpu[1] = pads[5]; // pb + pads_xpu[2] = pads[2]; // pt + pads_xpu[3] = pads[3]; // pd + pads_xpu[4] = pads[0]; // pl + pads_xpu[5] = pads[1]; // pr + + if (mode == "reflect") { + int r = xpu::reflection_pad3d(dev_ctx.x_context(), + in_data, + out_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, 
"reflection_pad3d"); + } else if (mode == "replicate") { + int r = xpu::replication_pad3d(dev_ctx.x_context(), + in_data, + out_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "replication_pad3d"); + } else if (mode == "constant") { + int r = xpu::constant_pad3d(dev_ctx.x_context(), + in_data, + out_data, + num, + channels, + in_depth, + in_height, + in_width, + pads_xpu, + value, + is_ncdhw); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant_pad3d"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(pad3d, XPU, ALL_LAYOUT, phi::Pad3dKernel, float) {} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py new file mode 100644 index 0000000000000..df4fd640571f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py @@ -0,0 +1,701 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import ( + create_test_class, + get_xpu_op_support_types, + XPUOpTestWrapper, +) + +from paddle.fluid import Program, program_guard, Executor, default_main_program + +paddle.enable_static() + + +class XPUTestPad3dOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'pad3d' + + class TestPad3dOp(XPUOpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "pad3d" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + + self.value = 0.0 + self.initTestCase() + self.python_api = paddle.nn.functional.pad + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {} + if self.variable_paddings: + self.attrs['paddings'] = [] + self.inputs['Paddings'] = ( + np.array(self.paddings).flatten().astype("int32") + ) + else: + self.attrs['paddings'] = ( + np.array(self.paddings).flatten().astype("int32") + ) + self.attrs['value'] = self.value + self.attrs['mode'] = self.mode + self.attrs['data_format'] = self.data_format + if self.data_format == "NCDHW": + paddings = [ + (0, 0), + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + ] + else: + paddings = [ + (0, 0), + (self.paddings[4], self.paddings[5]), + (self.paddings[2], self.paddings[3]), + (self.paddings[0], self.paddings[1]), + (0, 0), + ] + if self.mode == "constant": + out = np.pad( + self.inputs['X'], + paddings, + mode=self.mode, + constant_values=self.value, + ) + elif self.mode == "reflect": + out = np.pad(self.inputs['X'], paddings, mode=self.mode) + elif self.mode == "replicate": + out = np.pad(self.inputs['X'], paddings, mode="edge") + elif self.mode == "circular": + out = np.pad(self.inputs['X'], paddings, mode="wrap") + self.outputs = {'Out': out} + + def test_check_output(self): + 
self.check_output(check_eager=True) + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', check_eager=True) + + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 0, 0, 0, 0, 0] + self.mode = "constant" + self.data_format = "NCDHW" + self.pad_value = 0.0 + self.variable_paddings = False + + class TestCase1(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NCDHW" + self.value = 1.0 + self.variable_paddings = False + + class TestCase2(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [1, 1, 1, 1, 1, 1] + self.mode = "constant" + self.data_format = "NDHWC" + self.value = 1.0 + self.variable_paddings = False + + class TestCase3(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 1, 0, 2, 3] + self.mode = "reflect" + self.data_format = "NCDHW" + self.variable_paddings = False + + class TestCase4(TestPad3dOp): + def initTestCase(self): + self.shape = (4, 4, 4, 4, 4) + self.paddings = [0, 1, 2, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NDHWC" + self.variable_paddings = False + + class TestCase5(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 2, 1] + self.mode = "replicate" + self.data_format = "NCDHW" + self.variable_paddings = False + + class TestCase6(TestPad3dOp): + def initTestCase(self): + self.shape = (4, 4, 4, 4, 4) + self.paddings = [5, 4, 2, 1, 2, 3] + self.mode = "replicate" + self.data_format = "NDHWC" + self.variable_paddings = False + + class TestCase7(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NCDHW" + self.value = 1.0 + self.variable_paddings = True + + class TestCase8(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NDHWC" + self.value = 1.0 + self.variable_paddings = True + + class TestPadAPI(unittest.TestCase): + def setUp(self): + self.places = [paddle.XPUPlace(0)] + self.dtype = self.in_type + + def check_static_result_1(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.fluid.data(name="x", shape=input_shape) + result = F.pad( + x=x, pad=pad, value=value, mode=mode, data_format="NCDHW" + ) + exe = Executor(place) + fetches = exe.run( + default_main_program(), + feed={"x": input_data}, + fetch_list=[result], + ) + + np_out = self._get_numpy_out(input_data, pad, mode, value) + np.testing.assert_allclose(fetches[0], np_out, rtol=1e-05) + + def check_static_result_2(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (2, 3, 4, 5, 6) + pad = [1, 2, 1, 1, 1, 2] + mode = "reflect" + input_data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.fluid.data(name="x", shape=input_shape) + result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") + result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") + exe = Executor(place) + fetches = exe.run( + default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2], + ) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, data_format="NCDHW" + ) 
+ np_out2 = self._get_numpy_out( + input_data, pad, mode, data_format="NDHWC" + ) + np.testing.assert_allclose(fetches[0], np_out1, rtol=1e-05) + np.testing.assert_allclose(fetches[1], np_out2, rtol=1e-05) + + def check_static_result_3(self, place): + paddle.enable_static() + with program_guard(Program(), Program()): + input_shape = (2, 3, 4, 5, 6) + pad = [1, 2, 1, 1, 3, 4] + mode = "replicate" + input_data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.fluid.data(name="x", shape=input_shape) + result1 = F.pad(x=x, pad=pad, mode=mode, data_format="NCDHW") + result2 = F.pad(x=x, pad=pad, mode=mode, data_format="NDHWC") + exe = Executor(place) + fetches = exe.run( + default_main_program(), + feed={"x": input_data}, + fetch_list=[result1, result2], + ) + + np_out1 = self._get_numpy_out( + input_data, pad, mode, data_format="NCDHW" + ) + np_out2 = self._get_numpy_out( + input_data, pad, mode, data_format="NDHWC" + ) + np.testing.assert_allclose(fetches[0], np_out1, rtol=1e-05) + np.testing.assert_allclose(fetches[1], np_out2, rtol=1e-05) + + def _get_numpy_out( + self, input_data, pad, mode, value=0, data_format="NCDHW" + ): + if mode == "constant" and len(pad) == len(input_data.shape) * 2: + pad = np.reshape(pad, (-1, 2)).tolist() + elif data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NDHWC": + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCHW": + pad = [ + (0, 0), + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + elif data_format == "NHWC": + pad = [ + (0, 0), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + elif data_format == "NCL": + pad = [ + (0, 0), + (0, 0), + (pad[0], pad[1]), + ] + elif data_format == "NLC": + pad = [ + (0, 0), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") + + return out + + def test_static(self): + for place in self.places: + self.check_static_result_1(place=place) + self.check_static_result_2(place=place) + self.check_static_result_3(place=place) + + def test_dygraph_1(self): + paddle.disable_static() + input_shape = (1, 2, 3, 4, 5) + pad = [1, 2, 1, 1, 3, 4] + pad_3 = [1, 2, 1, 1, 3, 4, 5, 6, 7, 8] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(self.dtype) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCDHW" + ) + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NDHWC" + ) + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCDHW" + ) + tensor_data = paddle.to_tensor(input_data) + + y1 = F.pad( + tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NCDHW", + ) + y2 = F.pad( + tensor_data, + pad=pad, + mode=mode, + value=value, + data_format="NDHWC", + ) + y3 = F.pad( + tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCDHW", + ) + + np.testing.assert_allclose(y1.numpy(), np_out1, rtol=1e-05) + np.testing.assert_allclose(y2.numpy(), np_out2, rtol=1e-05) + np.testing.assert_allclose(y3.numpy(), np_out3, rtol=1e-05) + + def test_dygraph_2(self): + paddle.disable_static() + input_shape = (2, 3, 4, 5) + pad = [1, 1, 3, 4] + pad_3 = 
[1, 2, 1, 1, 3, 4, 5, 6] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(self.dtype) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCHW" + ) + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NHWC" + ) + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCHW" + ) + + tensor_data = paddle.to_tensor(input_data) + tensor_pad = paddle.to_tensor(pad, dtype="int32") + + y1 = F.pad( + tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NCHW", + ) + y2 = F.pad( + tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NHWC", + ) + y3 = F.pad( + tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCHW", + ) + + np.testing.assert_allclose(y1.numpy(), np_out1, rtol=1e-05) + np.testing.assert_allclose(y2.numpy(), np_out2, rtol=1e-05) + np.testing.assert_allclose(y3.numpy(), np_out3, rtol=1e-05) + + def test_dygraph_3(self): + paddle.disable_static() + input_shape = (3, 4, 5) + pad = [3, 4] + pad_3 = [3, 4, 5, 6, 7, 8] + mode = "constant" + value = 100 + input_data = np.random.rand(*input_shape).astype(self.dtype) + np_out1 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NCL" + ) + np_out2 = self._get_numpy_out( + input_data, pad, mode, value, data_format="NLC" + ) + np_out3 = self._get_numpy_out( + input_data, pad_3, mode, value, data_format="NCL" + ) + tensor_data = paddle.to_tensor(input_data) + tensor_pad = paddle.to_tensor(pad, dtype="int32") + + y1 = F.pad( + tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NCL", + ) + y2 = F.pad( + tensor_data, + pad=tensor_pad, + mode=mode, + value=value, + data_format="NLC", + ) + y3 = F.pad( + tensor_data, + pad=pad_3, + mode=mode, + value=value, + data_format="NCL", + ) + + np.testing.assert_allclose(y1.numpy(), np_out1, rtol=1e-05) + np.testing.assert_allclose(y2.numpy(), np_out2, rtol=1e-05) + np.testing.assert_allclose(y3.numpy(), np_out3, rtol=1e-05) + + class TestPad3dAPI(unittest.TestCase): + def _get_numpy_out( + self, input_data, pad, mode, value=0.0, data_format="NCDHW" + ): + if data_format == "NCDHW": + pad = [ + (0, 0), + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + ] + else: + pad = [ + (0, 0), + (pad[4], pad[5]), + (pad[2], pad[3]), + (pad[0], pad[1]), + (0, 0), + ] + + if mode == "constant": + out = np.pad(input_data, pad, mode=mode, constant_values=value) + elif mode == "reflect": + out = np.pad(input_data, pad, mode=mode) + elif mode == "replicate": + out = np.pad(input_data, pad, mode="edge") + elif mode == "circular": + out = np.pad(input_data, pad, mode="wrap") + + return out + + def setUp(self): + self.places = [paddle.XPUPlace(0)] + self.dtype = self.in_type + + def test_class(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5, 6, 7) + pad = [1, 2, 2, 1, 1, 0] + pad_int = 1 + value = 100 + input_data = np.random.rand(*input_shape).astype(self.dtype) + + pad_reflection = nn.Pad3D(padding=pad, mode="reflect") + pad_replication = nn.Pad3D(padding=pad, mode="replicate") + pad_constant = nn.Pad3D( + padding=pad, mode="constant", value=value + ) + pad_constant_int = nn.Pad3D( + padding=pad_int, mode="constant", value=value + ) + pad_circular = nn.Pad3D(padding=pad, mode="circular") + + data = paddle.to_tensor(input_data) + + output = pad_reflection(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCDHW" + ) + 
np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + output = pad_replication(data) + np_out = self._get_numpy_out( + input_data, pad, "replicate", data_format="NCDHW" + ) + np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + output = pad_constant(data) + np_out = self._get_numpy_out( + input_data, + pad, + "constant", + value=value, + data_format="NCDHW", + ) + np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + output = pad_constant_int(data) + np_out = self._get_numpy_out( + input_data, + [pad_int] * 6, + "constant", + value=value, + data_format="NCDHW", + ) + np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + def test_pad_tensor(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5, 6, 7) + pad = [1, 2, 2, 1, 1, 0] + pad_tensor = paddle.to_tensor(pad) + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_reflection_ncdhw = nn.Pad3D( + padding=pad_tensor, mode="reflect", data_format="NCDHW" + ) + pad_reflection_ndhwc = nn.Pad3D( + padding=pad_tensor, mode="reflect", data_format="NDHWC" + ) + data = paddle.to_tensor(input_data) + + output = pad_reflection_ncdhw(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCDHW" + ) + np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + output = pad_reflection_ndhwc(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NDHWC" + ) + np.testing.assert_allclose(output.numpy(), np_out, rtol=1e-05) + + class TestPad3dOpError(unittest.TestCase): + def setUp(self): + self.places = [paddle.XPUPlace(0)] + self.dtype = self.in_type + + def test_errors(self): + def test_variable(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(self.dtype) + y = F.pad(x=data, pad=[1, 1, 1, 1, 1, 1], data_format="NCDHW") + + def test_reflect_1(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.to_tensor(data) + y = F.pad( + x, + pad=[5, 6, 1, 1, 1, 1], + value=1, + mode='reflect', + data_format="NCDHW", + ) + + def test_reflect_2(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.to_tensor(data) + y = F.pad( + x, + pad=[1, 1, 4, 3, 1, 1], + value=1, + mode='reflect', + data_format="NCDHW", + ) + + def test_reflect_3(): + input_shape = (1, 2, 3, 4, 5) + data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.to_tensor(data) + y = F.pad( + x, + pad=[1, 1, 1, 1, 2, 3], + value=1, + mode='reflect', + data_format="NCDHW", + ) + + def test_replicate_1(): + input_shape = (1, 2, 0, 4, 5) + data = np.random.rand(*input_shape).astype(self.dtype) + x = paddle.to_tensor(data) + y = F.pad( + x, + pad=[1, 1, 1, 1, 2, 3], + mode='replicate', + data_format="NCDHW", + ) + + paddle.disable_static() + for place in self.places: + self.assertRaises(ValueError, test_variable) + self.assertRaises(Exception, test_reflect_1) + self.assertRaises(Exception, test_reflect_2) + self.assertRaises(Exception, test_reflect_3) + self.assertRaises(Exception, test_replicate_1) + paddle.enable_static() + + class TestPadDataformatError(unittest.TestCase): + def test_errors(self): + def test_ncl(): + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = ( + np.arange(np.prod(input_shape), dtype=np.float64).reshape( + input_shape + ) + + 1 + ) + my_pad = nn.Pad1D( + padding=pad, mode="replicate", data_format="NCL" + ) + data = 
paddle.to_tensor(data) + result = my_pad(data) + + def test_nchw(): + input_shape = (1, 2, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = ( + np.arange(np.prod(input_shape), dtype=np.float64).reshape( + input_shape + ) + + 1 + ) + my_pad = nn.Pad1D( + padding=pad, mode="replicate", data_format="NCHW" + ) + data = paddle.to_tensor(data) + result = my_pad(data) + + def test_ncdhw(): + input_shape = (1, 2, 3, 4) + pad = paddle.to_tensor(np.array([2, 1, 2, 1]).astype('int32')) + data = ( + np.arange(np.prod(input_shape), dtype=np.float64).reshape( + input_shape + ) + + 1 + ) + my_pad = nn.Pad1D( + padding=pad, mode="replicate", data_format="NCDHW" + ) + data = paddle.to_tensor(data) + result = my_pad(data) + + self.assertRaises(AssertionError, test_ncl) + self.assertRaises(AssertionError, test_nchw) + self.assertRaises(AssertionError, test_ncdhw) + + +support_types = get_xpu_op_support_types('pad3d') +for stype in support_types: + create_test_class(globals(), XPUTestPad3dOp, stype) + + +if __name__ == '__main__': + unittest.main() From d2f87d96ec46a3681fd989262a6379b1bd5310eb Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Thu, 24 Nov 2022 19:43:01 +0800 Subject: [PATCH 205/210] add exp_grad, hard_sigmoid and hard_sigmoid_grad for xpu, test=kunlun (#48307) --- .../fluid/platform/device/xpu/xpu2_op_list.h | 5 ++ .../phi/kernels/xpu/activation_grad_kernel.cc | 55 +++++++++++++++++++ paddle/phi/kernels/xpu/activation_kernel.cc | 24 ++++++++ 3 files changed, 84 insertions(+) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 7474ac88ac8db..f8b15d4d4ee28 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -220,6 +220,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"exp_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"expand_as_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), @@ -314,6 +315,10 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP32, XPUPlace())})}, {"grid_sampler", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/paddle/phi/kernels/xpu/activation_grad_kernel.cc b/paddle/phi/kernels/xpu/activation_grad_kernel.cc index a30f63d176e50..e3b5e1bfcd3fd 100644 --- a/paddle/phi/kernels/xpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_grad_kernel.cc @@ -160,6 +160,21 @@ int xpu_activation_backward(const Context& dev_ctx, return r; } +template +struct XPUExpGradFunctor : public funcs::BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + template + void operator()(const Context& dev_ctx, + const DenseTensor* x, + const DenseTensor* out, + const DenseTensor* dout, + DenseTensor* dx) const { + int r = xpu_activation_backward( + dev_ctx, x, out, dout, dx, xpu::exp_grad); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp_grad"); + } +}; + template struct XPULogGradFunctor : public funcs::BaseActivationFunctor { template @@ -238,6 
+253,39 @@ struct XPULeakyReluGradFunctor : public funcs::BaseActivationFunctor { } }; +template +struct XPUHardSigmoidGradFunctor : public funcs::BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + float slope; + float offset; + typename funcs::BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(const Context& dev_ctx, + const DenseTensor* x, + const DenseTensor* out, + const DenseTensor* dout, + DenseTensor* dx) const { + const T* y_data = out->data(); + const T* y_grad = dout->data(); + T* x_grad = dx->data(); + + auto xpu_context = dev_ctx.x_context(); + int r = xpu::hard_sigmoid_grad( + xpu_context, + reinterpret_cast( + y_data), // hard_sigmoid_grad do not need x_data + reinterpret_cast(y_data), + reinterpret_cast(y_grad), + reinterpret_cast(x_grad), + dx->numel(), + slope); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_sigmoid_grad"); + } +}; + template struct XPUHardSwishGradFunctor : public funcs::BaseActivationFunctor { float threshold; @@ -497,6 +545,7 @@ struct XPUSoftPlusGradFunctor : public funcs::BaseActivationFunctor { } }; +DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, XPUExpGradFunctor); DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, XPUReciprocalGradFunctor); DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor); DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor); @@ -524,6 +573,10 @@ DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, XPUSoftPlusGradFunctor, beta, threshold) +DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + XPUHardSigmoidGradFunctor, + slope, + offset) template void HardSwishGradKernel(const Context& dev_ctx, @@ -560,8 +613,10 @@ PD_REGISTER_KERNEL(tanh_grad, phi::TanhGradKernel, float, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(exp_grad, ExpGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_swish_grad, HardSwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel) diff --git a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc index f730c38e8f0f2..51f74bd34750a 100644 --- a/paddle/phi/kernels/xpu/activation_kernel.cc +++ b/paddle/phi/kernels/xpu/activation_kernel.cc @@ -226,6 +226,25 @@ void PowKernel(const Context& dev_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); } +template +struct XPUHardSigmoidFunctor : public funcs::BaseActivationFunctor { + float slope; + float offset; + typename funcs::BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) const { + using XPUType = typename XPUTypeTrait::Type; + int r = xpu_activation_1attr_func( + dev_ctx, x, out, slope, xpu::hard_sigmoid); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_sigmoid"); + } +}; + template struct XPUHardSwishFunctor : public funcs::BaseActivationFunctor { float threshold; @@ -428,6 +447,10 @@ DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, XPUSoftplusFunctor, beta, threshold) +DEFINE_XPU_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + XPUHardSigmoidFunctor, + slope, + offset) template void HardSwishRawKernel(const Context& 
dev_ctx, @@ -459,6 +482,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel) // no grad PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(hard_swish_raw, HardSwishRawKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) PD_REGISTER_ACTIVATION_KERNEL(pow, PowKernel) From c3fdb8177a827a89c6feea3b69d00222e8fdf0b9 Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Thu, 24 Nov 2022 21:25:34 +0800 Subject: [PATCH 206/210] remove paddle.fluid.layers.nn.shuffle_channel (#48226) --- python/paddle/fluid/layers/nn.py | 76 ------------------- .../ir/inference/test_trt_subgraph_pass.py | 28 ------- .../fluid/tests/unittests/test_layers.py | 8 -- 3 files changed, 112 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 215dd0845e725..d792e78629252 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -146,7 +146,6 @@ 'bilinear_tensor_product', 'merge_selected_rows', 'get_tensor_from_selected_rows', - 'shuffle_channel', 'temporal_shift', 'py_func', 'psroi_pool', @@ -9849,81 +9848,6 @@ def get_tensor_from_selected_rows(x, name=None): return out -def shuffle_channel(x, group, name=None): - """ - This operator shuffles the channels of input x. - It divide the input channels in each group into :attr:`group` subgroups, - and obtain a new order by selecting element from every subgroup one by one. - - Please refer to the paper - https://arxiv.org/pdf/1707.01083.pdf - - .. code-block:: text - - Given a 4-D tensor input with the shape (N, C, H, W): - input.shape = (1, 4, 2, 2) - input.data =[[[[0.1, 0.2], - [0.2, 0.3]], - - [[0.3, 0.4], - [0.4, 0.5]], - - [[0.5, 0.6], - [0.6, 0.7]], - - [[0.7, 0.8], - [0.8, 0.9]]]] - Given group: 2 - then we get a 4-D tensor out with the same shape of input: - out.shape = (1, 4, 2, 2) - out.data = [[[[0.1, 0.2], - [0.2, 0.3]], - - [[0.5, 0.6], - [0.6, 0.7]], - - [[0.3, 0.4], - [0.4, 0.5]], - - [[0.7, 0.8], - [0.8, 0.9]]]] - - Args: - x(Variable): The input tensor variable. It should be a 4-D tensor with shape [N, C, H, W] - group(int): Indicating the counts of subgroups, It should divide the number of channels. - - Returns: - out(Variable): the channels shuffling result is a tensor variable with the - same shape and same type as the input. - - Raises: - ValueError: If group is not an int type variable. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - paddle.enable_static() - input = fluid.data(name='input', shape=[None,4,2,2], dtype='float32') - out = fluid.layers.shuffle_channel(x=input, group=2) - """ - helper = LayerHelper("shuffle_channel", **locals()) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - if not isinstance(group, int): - raise TypeError("group must be int type") - - helper.append_op( - type="shuffle_channel", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"group": group}, - ) - return out - - @templatedoc() def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): """ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index 86a995c45c01c..e260f46a5de7b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -453,33 +453,5 @@ def test_check_output(self): ) -class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[-1, 6, 64, 64], dtype="float32" - ) - sc_out = fluid.layers.shuffle_channel(data, group=3) - out = fluid.layers.batch_norm(sc_out, is_test=True) - self.feeds = { - "data": np.random.random([1, 6, 64, 64]).astype("float32"), - } - self.enable_trt = True - self.trt_parameters = ( - TensorRTSubgraphPassShuffleChannelTest.TensorRTParam( - 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False - ) - ) - self.fetch_list = [out] - - def test_check_output(self): - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) - self.assertTrue( - PassVersionChecker.IsCompatible('tensorrt_subgraph_pass') - ) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f78bb59a06926..f62adf08ecf95 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3744,14 +3744,6 @@ def make_temporal_shift(self): out = layers.temporal_shift(x, seg_num=2, shift_ratio=0.2) return out - def make_shuffle_channel(self): - with program_guard( - fluid.default_main_program(), fluid.default_startup_program() - ): - x = self._get_data(name="X", shape=[16, 4, 4], dtype="float32") - out = layers.shuffle_channel(x, group=4) - return out - def make_fsp_matrix(self): with program_guard( fluid.default_main_program(), fluid.default_startup_program() From 1b59830b767a208206b663359725010ce4236ba7 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 24 Nov 2022 21:57:58 +0800 Subject: [PATCH 207/210] Delete fluid_convert_utils fix PR-CI-Build (#48347) --- paddle/fluid/framework/CMakeLists.txt | 7 +------ paddle/phi/core/CMakeLists.txt | 6 +++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 05c2a4ccfb2c4..e48ec3ad0385b 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1242,9 +1242,4 @@ cc_test( SRCS phi_utils_test.cc DEPS phi_utils) -cc_library(fluid_convert_utils DEPS data_type) - -cc_test( - convert_utils_test - SRCS convert_utils_test.cc - DEPS fluid_convert_utils) 
+cc_test(convert_utils_test SRCS convert_utils_test.cc) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 3ecd022ff5d23..38fd021928104 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -19,12 +19,12 @@ if(WITH_XPU) cc_library( kernel_factory SRCS kernel_factory.cc - DEPS phi_enforce fluid_convert_utils convert_utils xpu_op_list) + DEPS phi_enforce convert_utils xpu_op_list) else() cc_library( kernel_factory SRCS kernel_factory.cc - DEPS phi_enforce fluid_convert_utils) + DEPS phi_enforce convert_utils) endif() cc_library( kernel_context @@ -55,7 +55,7 @@ cc_library( cc_library( dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc - DEPS convert_utils fluid_convert_utils tensor_meta tensor_base ddim) + DEPS convert_utils tensor_meta tensor_base ddim) target_link_libraries(dense_tensor malloc) From aaf3a13efdea848d35a3cc533fb8cfb48d997b75 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 25 Nov 2022 08:31:36 +0800 Subject: [PATCH 208/210] add bfloat16 support for more ops (#48272) * add bfloat16 support for more ops * fix ci compile * fix windows compile error * fix windows compile error * fix rocm compile error * fix ROCM compile error --- paddle/phi/kernels/funcs/cross_entropy.cu | 4 ++++ paddle/phi/kernels/funcs/cross_entropy.h | 22 ++++++++++++++++--- paddle/phi/kernels/funcs/math.h | 4 ++++ paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 2 ++ .../kernels/gpu/cross_entropy_grad_kernel.cu | 21 ++++++++++++++++++ .../phi/kernels/gpu/cross_entropy_kernel.cu | 11 ++++++++++ .../kernels/gpu/index_sample_grad_kernel.cu | 2 ++ paddle/phi/kernels/gpu/index_sample_kernel.cu | 2 ++ 8 files changed, 65 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/cross_entropy.cu b/paddle/phi/kernels/funcs/cross_entropy.cu index 174c1c1bd934e..9f08214ef5a07 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.cu +++ b/paddle/phi/kernels/funcs/cross_entropy.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/cross_entropy.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" @@ -153,6 +154,9 @@ void CrossEntropyFunctor::operator()( template class CrossEntropyFunctor; template class CrossEntropyFunctor; template class CrossEntropyFunctor; +#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(8, 1, 0) +template class CrossEntropyFunctor; +#endif } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/cross_entropy.h b/paddle/phi/kernels/funcs/cross_entropy.h index 692ba5efef5b7..3c4057420c3d4 100644 --- a/paddle/phi/kernels/funcs/cross_entropy.h +++ b/paddle/phi/kernels/funcs/cross_entropy.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" @@ -48,12 +49,27 @@ template <> struct TolerableValue { HOSTDEVICE phi::dtype::float16 operator()( const phi::dtype::float16& x) const { - if (phi::dtype::isfinite(x)) + if (phi::dtype::isfinite(x)) { return x; - else if (x > static_cast(0)) + } else if (x > static_cast(0)) { return std::numeric_limits::max(); - else + } else { return std::numeric_limits::min(); + } + } +}; + +template <> +struct TolerableValue { + HOSTDEVICE phi::dtype::bfloat16 operator()( + const phi::dtype::bfloat16& x) const { + if (phi::dtype::isfinite(x)) { + return x; + } else if (x > static_cast(0)) { + return std::numeric_limits::max(); + } else { + return std::numeric_limits::min(); + } } }; diff --git a/paddle/phi/kernels/funcs/math.h b/paddle/phi/kernels/funcs/math.h index f8c373badf187..004279c25d4af 100644 --- a/paddle/phi/kernels/funcs/math.h +++ b/paddle/phi/kernels/funcs/math.h @@ -33,6 +33,10 @@ inline HOSTDEVICE phi::dtype::float16 real_log(phi::dtype::float16 x) { return static_cast(::logf(static_cast(x))); } +inline HOSTDEVICE phi::dtype::bfloat16 real_log(phi::dtype::bfloat16 x) { + return static_cast(::logf(static_cast(x))); +} + inline HOSTDEVICE float real_log(float x) { return ::logf(x); } inline HOSTDEVICE double real_log(double x) { return ::log(x); } diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 5764ba54b397a..74be557c7d667 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -253,6 +253,7 @@ PD_REGISTER_KERNEL(arg_min, ALL_LAYOUT, phi::ArgMinKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double, int32_t, @@ -265,6 +266,7 @@ PD_REGISTER_KERNEL(arg_max, ALL_LAYOUT, phi::ArgMaxKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double, int32_t, diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 8618f947be457..7bafa03aba5ff 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -281,6 +281,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT, @@ -288,3 +289,23 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, float, double, phi::dtype::float16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(cross_entropy_with_softmax_grad, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxGradKernel, + float, + double, + phi::dtype::float16) {} +#endif +#endif diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 93d5f06b66564..b9a2b07e6961f 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -1468,6 +1468,16 @@ PD_REGISTER_KERNEL(cross_entropy_with_softmax, float, phi::dtype::float16) {} #else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(cross_entropy_with_softmax, + GPU, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxKernel, + float, + double, + phi::dtype::float16, + 
                   phi::dtype::bfloat16) {}
+#else
 PD_REGISTER_KERNEL(cross_entropy_with_softmax,
                    GPU,
                    ALL_LAYOUT,
                    phi::CrossEntropyWithSoftmaxKernel,
                    float,
                    double,
                    phi::dtype::float16) {}
 #endif
+#endif
diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
index b016cf20b1332..9bf5181c1746c 100755
--- a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu
@@ -130,6 +130,8 @@ PD_REGISTER_KERNEL(index_sample_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleGradKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu
index 31fe1ff2a02f2..2ea5eaa9eaad1 100755
--- a/paddle/phi/kernels/gpu/index_sample_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu
@@ -103,6 +103,8 @@ PD_REGISTER_KERNEL(index_sample,
                    GPU,
                    ALL_LAYOUT,
                    phi::IndexSampleKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
From 080349cd61cb1e5a22fe68f90cce606c5a63dc08 Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Fri, 25 Nov 2022 08:34:40 +0800
Subject: [PATCH 209/210] fix cuda 116 compile error (#48342)

---
 paddle/phi/backends/gpu/gpu_resources.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc
index e05e75636c1a0..3aeb73e1b63e4 100644
--- a/paddle/phi/backends/gpu/gpu_resources.cc
+++ b/paddle/phi/backends/gpu/gpu_resources.cc
@@ -22,6 +22,7 @@

 #ifdef PADDLE_WITH_CUDA
 #include "paddle/phi/backends/dynload/cublas.h"
+#include "paddle/phi/backends/dynload/cublasLt.h"
 #include "paddle/phi/backends/dynload/cudnn.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/backends/dynload/cusparse.h"
From d90469a4a43baa54e53dc90238140919e032ba83 Mon Sep 17 00:00:00 2001
From: houj04 <35131887+houj04@users.noreply.github.com>
Date: Fri, 25 Nov 2022 09:21:24 +0800
Subject: [PATCH 210/210] fix xpu compile on phi::enforce. (#48345)

---
 paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
index b2fd80530feb7..afa02f62c324b 100644
--- a/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
+++ b/paddle/fluid/platform/device/xpu/tests/enforce_xpu_test.cc
@@ -27,7 +27,7 @@ bool CheckXPUStatusFailure(T value, const std::string& msg) {
   try {
     PADDLE_ENFORCE_XPU_SUCCESS(value);
     return false;
-  } catch (paddle::platform::EnforceNotMet& error) {
+  } catch (phi::enforce::EnforceNotMet& error) {
     std::string ex_msg = error.what();
     std::cout << ex_msg << std::endl;
     return ex_msg.find(msg) != std::string::npos;
@@ -45,7 +45,7 @@ bool CheckXDNNStatusFailure(T value, const std::string& msg) {
   try {
     PADDLE_ENFORCE_XDNN_SUCCESS(value, "XDNN Error ");
     return false;
-  } catch (paddle::platform::EnforceNotMet& error) {
+  } catch (phi::enforce::EnforceNotMet& error) {
     std::string ex_msg = error.what();
    std::cout << ex_msg << std::endl;
     return ex_msg.find(msg) != std::string::npos;
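The exp_grad, hard_sigmoid and hard_sigmoid_grad kernels registered for XPU in patch 205 are reached through the ordinary Python API rather than called directly. A minimal smoke test for that path might look like the sketch below; it assumes a kunlun/XPU build in which paddle.set_device('xpu') succeeds, and the shapes and tolerances are illustrative only, not taken from the patch.

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.set_device('xpu')  # assumption: Paddle compiled with XPU support

    x_np = np.random.uniform(-1.0, 1.0, [4, 16]).astype('float32')

    # exp forward + backward goes through the newly registered exp_grad kernel.
    x = paddle.to_tensor(x_np, stop_gradient=False)
    y = paddle.exp(x)
    y.sum().backward()
    np.testing.assert_allclose(x.grad.numpy(), np.exp(x_np), rtol=1e-5)

    # hard_sigmoid forward uses the newly registered hard_sigmoid kernel
    # (framework defaults: slope = 1/6, offset = 0.5).
    z = F.hardsigmoid(paddle.to_tensor(x_np))
    ref = np.clip(x_np / 6.0 + 0.5, 0.0, 1.0)
    np.testing.assert_allclose(z.numpy(), ref, rtol=1e-5)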
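Patch 206 deletes fluid.layers.shuffle_channel without naming a replacement. In 2.x releases the same channel-shuffle operation is presumably covered by paddle.nn.functional.channel_shuffle / paddle.nn.ChannelShuffle; whether that API exists depends on the installed Paddle version, so the migration sketch below is an assumption to verify rather than a documented equivalence.

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 4, 2, 2])

    # Previously: out = fluid.layers.shuffle_channel(x=input, group=2)
    # Assumed 2.x replacement (verify it is available in your Paddle version):
    out = F.channel_shuffle(x, groups=2)
    print(out.shape)  # [1, 4, 2, 2]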
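For the bfloat16 registrations added in patch 208 (cross_entropy_with_softmax, arg_min/arg_max, index_sample and their grads), a rough dygraph sanity check might look like the following. It assumes a CUDA build with cuDNN >= 8.1, a bfloat16-capable GPU (e.g. Ampere), and that casting to 'bfloat16' is supported by the installed build; none of these calls appear in the patch itself.

    import paddle
    import paddle.nn.functional as F

    paddle.set_device('gpu')  # assumption: CUDA build, cuDNN >= 8.1, bf16-capable GPU

    logits = paddle.cast(paddle.randn([8, 10]), 'bfloat16')
    labels = paddle.randint(0, 10, [8], dtype='int64')

    # Exercises the new bfloat16 cross_entropy_with_softmax GPU kernel.
    loss = F.cross_entropy(logits, labels, reduction='none')

    # index_sample and argmax also gained float16/bfloat16 registrations.
    index = paddle.randint(0, 10, [8, 4], dtype='int32')
    sampled = paddle.index_sample(logits, index)
    pred = paddle.argmax(logits, axis=-1)

    print(loss.dtype, sampled.dtype, pred.dtype)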