From f6d53f42813f9779c04df4ae8736836c1a625ef9 Mon Sep 17 00:00:00 2001 From: zhouwei25 Date: Mon, 24 Apr 2023 14:06:52 +0000 Subject: [PATCH] [Zero-Dim] Support paddle.sum/mean/loss api output 0D,test=allcase --- .../composite_backward_api.h | 2 +- paddle/phi/infermeta/unary.cc | 7 +- .../distributed/auto_parallel/completion.py | 20 ++- .../distributed/auto_parallel/engine.py | 2 +- .../auto_parallel/operators/common.py | 2 +- .../auto_parallel/operators/dist_default.py | 4 +- .../auto_parallel/operators/dist_eltwise.py | 4 +- .../auto_parallel/operators/dist_embedding.py | 2 +- .../auto_parallel/operators/dist_matmul.py | 18 +-- .../auto_parallel/operators/dist_reshape.py | 12 +- .../auto_parallel/operators/dist_scale.py | 4 +- .../auto_parallel/operators/dist_softmax.py | 4 +- .../auto_parallel/operators/dist_transpose.py | 4 +- .../auto_parallel/tuner/rule_based_tuner.py | 12 +- .../paddle/distributed/auto_parallel/utils.py | 13 +- .../hybrid_parallel_optimizer.py | 49 ++++--- .../sharding/group_sharded_utils.py | 55 ++++---- .../distributed/fleet/metrics/metric.py | 6 +- .../distributed/passes/auto_parallel_amp.py | 46 +++++-- .../distributed/passes/auto_parallel_fp16.py | 9 +- .../passes/auto_parallel_grad_clip.py | 4 +- .../passes/auto_parallel_quantization.py | 4 +- .../tests/unittests/check_nan_inf_base.py | 6 +- .../tests/unittests/seresnext_test_base.py | 14 +- .../fluid/tests/unittests/test_argsort_op.py | 5 +- .../paddle/fluid/tests/unittests/test_cond.py | 2 +- .../unittests/test_cosine_embedding_loss.py | 8 +- .../test_eager_deletion_recurrent_op.py | 4 +- .../unittests/test_fetch_lod_tensor_array.py | 8 +- .../unittests/test_fuse_all_reduce_pass.py | 10 +- .../tests/unittests/test_fuse_bn_act_pass.py | 4 +- .../unittests/test_fuse_bn_add_act_pass.py | 4 +- .../test_fuse_elewise_add_act_pass.py | 10 +- .../unittests/test_fuse_optimizer_pass.py | 10 +- .../test_fuse_relu_depthwise_conv_pass.py | 10 +- .../tests/unittests/test_gradient_clip.py | 2 +- .../unittests/test_hinge_embedding_loss.py | 12 +- .../unittests/test_ir_memory_optimize_pass.py | 7 +- .../fluid/tests/unittests/test_l1_loss.py | 4 +- .../fluid/tests/unittests/test_mse_loss.py | 12 +- .../fluid/tests/unittests/test_nan_inf.py | 4 - .../test_parallel_executor_run_cinn.py | 2 +- ...llel_executor_seresnext_with_reduce_cpu.py | 48 ++++--- .../tests/unittests/test_recurrent_op.py | 4 +- .../unittests/test_resnet50_with_cinn.py | 2 +- .../tests/unittests/test_run_program_op.py | 2 +- .../tests/unittests/test_zero_dim_tensor.py | 128 ++++++++++++------ .../incubate/autograd/composite_rules.py | 2 +- .../distributed/models/moe/grad_clip.py | 12 +- test/auto_parallel/test_while_op_partition.py | 2 +- .../test_autograd_functional_static.py | 4 +- .../test_multi_precision_fp16_train.py | 2 +- .../test_distribution_transform.py | 2 +- test/legacy_test/test_async_read_write.py | 2 +- .../vjp/eager/test_comp_eager_sum_grad.py | 2 +- ...test_standalone_cuda_graph_multi_stream.py | 2 +- test/xpu/test_zero_dim_tensor_xpu.py | 3 +- 57 files changed, 380 insertions(+), 258 deletions(-) diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index a42c41c1ba229..014edef530286 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -317,7 +317,7 @@ void sum_grad(const Tensor& x, if (!keepdim) { auto axis_ = std::vector(); if (reduce_all) { - for 
(int64_t i = 1; i < x_dim_size; i++) { + for (int64_t i = 0; i < x_dim_size; i++) { axis_.push_back(i); } } else { diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ea27eba513051..42f231d85f84c 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -4002,9 +4002,6 @@ DDim OriginReduceInferDim(const MetaTensor& x, out_dim_vector.push_back(x.dims().at(i)); } } - if (x_rank > 0 && out_dim_vector.size() == 0) { - out_dim_vector.push_back(1); - } DDim out_dim = phi::make_ddim(out_dim_vector); return out_dim; @@ -4021,14 +4018,14 @@ DDim OriginReduceInferDimForIntArrayAxis(const MetaTensor& x, if (keep_dim) { vec_dim = std::vector(x.dims().size(), 1); } else { - vec_dim = {1}; + vec_dim = {}; } } else { if (keep_dim) { vec_dim = std::vector(x.dims().size(), -1); } else { auto x_rank = static_cast(x.dims().size()); - if (vec_axis.size() >= x_rank) { + if (vec_axis.size() > x_rank) { vec_dim = {-1}; } else { vec_dim = std::vector(x.dims().size() - vec_axis.size(), -1); diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 91ffc556c78bd..5f2ab7e102b0b 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -1688,7 +1688,7 @@ def complete_update_annotation(self, serial_main_program): world_ranks ) out_dist_attr.dims_mapping = [ - -1 for _ in range(len(out_var.shape)) + -1 for _ in out_var.shape ] self._dist_context.set_tensor_dist_attr_for_program( out_var, out_dist_attr @@ -1732,7 +1732,9 @@ def complete_update_annotation(self, serial_main_program): len(out_var.shape) == 1 and out_var.shape[0] == 1 ) - out_dist_attr.dims_mapping = [-1] + out_dist_attr.dims_mapping = [ + -1 for _ in out_var.shape + ] self._dist_context.set_tensor_dist_attr_for_program( out_var, out_dist_attr ) @@ -1802,16 +1804,20 @@ def complete_update_annotation(self, serial_main_program): param.name, ref_dims_mapping ) learning_var = vars[op.input("LearningRate")[0]] - op_dist_attr.set_input_dims_mapping(learning_var.name, [-1]) + op_dist_attr.set_input_dims_mapping( + learning_var.name, [-1 for _ in learning_var.shape] + ) op_dist_attr.set_output_dims_mapping( - learning_var.name, [-1] + learning_var.name, [-1 for _ in learning_var.shape] ) if not learning_rate_completed: learning_rate_completed = True var_dist_attr = TensorDistAttr() var_dist_attr.process_mesh = ProcessMesh(world_ranks) - var_dist_attr.dims_mapping = [-1] + var_dist_attr.dims_mapping = [ + -1 for _ in learning_var.shape + ] self._dist_context.set_tensor_dist_attr_for_program( learning_var, var_dist_attr ) @@ -1841,10 +1847,10 @@ def complete_update_annotation(self, serial_main_program): ): input_var_attr.dims_mapping = [-1] op_dist_attr.set_input_dims_mapping( - input_var.name, [-1] + input_var.name, [-1 for _ in input_var.shape] ) op_dist_attr.set_output_dims_mapping( - input_var.name, [-1] + input_var.name, [-1 for _ in input_var.shape] ) else: input_var_attr.dims_mapping = ref_dims_mapping diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 567723aa29321..cb735fe11b86e 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -511,7 +511,7 @@ def _prepare_logger( loss_indices = fetch_indices[group_idx] assert len(loss_indices) <= 1 for idx in loss_indices: - logs["loss"] = outs[idx][0] + logs["loss"] = outs[idx] group_idx += 
1 # logging metrics dist_context = self._dist_contexts[mode] diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 209319f861a77..987c533a012eb 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -393,7 +393,7 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): for var_name in act_grad_names: var_dim_mapping = op_dist_attr.get_input_dims_mapping(var_name) - # consider that the variable's shape is None + # consider that the variable's shape is [], which is 0D # TODO utilize the batch_dim attr instead of "0" in future batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 2e7ad3d12e0b6..bad92a02c63f0 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -159,7 +159,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): ): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: need_gradient_allreduce = True break diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py index 3e0924d143ff6..5c11dfba08fe1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py @@ -101,7 +101,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): ): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: need_gradient_allreduce = True break diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 7176341feedfb..4f13c89bb14fd 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -252,7 +252,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("Ids")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index f3e1c74771b73..8825e14d9aba7 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -651,7 +651,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and 
mesh_shape[batch_size_axis] > 1 @@ -1028,7 +1028,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -1365,7 +1365,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -1552,7 +1552,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -1929,7 +1929,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -2264,7 +2264,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -2449,7 +2449,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -2832,7 +2832,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 @@ -3178,7 +3178,7 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): backward_op.input("X")[0] ) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if ( batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index d1e590c379ec3..267e8437abacc 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -120,7 +120,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} @@ -377,7 +379,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and 
mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} @@ -637,7 +641,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_scale.py b/python/paddle/distributed/auto_parallel/operators/dist_scale.py index e95e001b89000..66a35b1eadb68 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_scale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_scale.py @@ -100,7 +100,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): ): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: need_gradient_allreduce = True break diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py index d5c3802e50689..814464d659865 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py @@ -94,7 +94,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py index 5fa1f3a7bac39..762c5b9209cd5 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py @@ -183,7 +183,9 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.shape - batch_size_axis = var_dim_mapping[0] + batch_size_axis = ( + var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 + ) if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} diff --git a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py index 9f71163f6d78e..5ef0e87293345 100644 --- a/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py @@ -1727,7 +1727,9 @@ def _complete_sub_update_program(self, sub_program_dist_context): len(out_var.shape) == 1 and out_var.shape[0] == 1 ) - out_dist_attr.dims_mapping = [-1] + out_dist_attr.dims_mapping = [ + -1 for _ in out_var.shape + ] sub_program_dist_context.set_tensor_dist_attr_for_program( out_var, out_dist_attr ) @@ -1798,17 +1800,19 @@ def _complete_sub_update_program(self, sub_program_dist_context): ) learning_var = vars[op.input("LearningRate")[0]] op_dist_attr.set_input_dims_mapping( - 
learning_var.name, [-1] + learning_var.name, [-1 for i in learning_var.shape] ) op_dist_attr.set_output_dims_mapping( - learning_var.name, [-1] + learning_var.name, [-1 for i in learning_var.shape] ) if not learning_rate_completed: learning_rate_completed = True var_dist_attr = TensorDistAttr() var_dist_attr.process_mesh = world_ranks - var_dist_attr.dims_mapping = [-1] + var_dist_attr.dims_mapping = [ + -1 for i in learning_var.shape + ] sub_program_dist_context.set_tensor_dist_attr_for_program( learning_var, var_dist_attr ) diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 6b6afca969b7e..f4dfb8d9c20d5 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1466,7 +1466,8 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): ), "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part.".format( op_desc.type(), idx, mapping ) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: @@ -1480,7 +1481,8 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): ), "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part.".format( op_desc.type(), idx, mapping ) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: assert ( dims_mapping[0] == -1 @@ -1505,7 +1507,7 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): @@ -1514,7 +1516,10 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: + if ( + len(dims_mapping) >= 1 + and compatible_dim_mapping != dims_mapping[0] + ): dims_mapping[0] = compatible_dim_mapping changed = True else: diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index ab1b270e2fd88..73eb4bf88a75d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import numpy as np import paddle from paddle import framework @@ -95,11 +96,10 @@ def _dygraph_clip(self, params_grads): # global norm of distributed FP16 params_and_grads if len(sum_square_dist_fp16) == 0: global_norm_dist_fp16 = paddle.to_tensor( - [0.0], dtype=paddle.float32 + np.array(0.0), dtype=paddle.float32 ) else: - global_norm_dist_fp16 = paddle.concat(sum_square_dist_fp16) - global_norm_dist_fp16 = paddle.sum(global_norm_dist_fp16) + global_norm_dist_fp16 = paddle.add_n(sum_square_dist_fp16) global_norm_dist_fp16 = paddle.cast( global_norm_dist_fp16, dtype=paddle.float32 ) @@ -107,11 +107,10 @@ def _dygraph_clip(self, params_grads): # global norm of non-distributed FP16 params_and_grads if len(sum_square_not_dist_fp16) == 0: global_norm_not_dist_fp16 = paddle.to_tensor( - [0.0], dtype=paddle.float32 + np.array(0.0), dtype=paddle.float32 ) else: - global_norm_not_dist_fp16 = paddle.concat(sum_square_not_dist_fp16) - global_norm_not_dist_fp16 = paddle.sum(global_norm_not_dist_fp16) + global_norm_not_dist_fp16 = paddle.add_n(sum_square_not_dist_fp16) global_norm_not_dist_fp16 = paddle.cast( global_norm_not_dist_fp16, dtype=paddle.float32 ) @@ -119,11 +118,10 @@ def _dygraph_clip(self, params_grads): # global norm of distributed BF16 params_and_grads if len(sum_square_dist_bf16) == 0: global_norm_dist_bf16 = paddle.to_tensor( - [0.0], dtype=paddle.float32 + np.array(0.0), dtype=paddle.float32 ) else: - global_norm_dist_bf16 = paddle.concat(sum_square_dist_bf16) - global_norm_dist_bf16 = paddle.sum(global_norm_dist_bf16) + global_norm_dist_bf16 = paddle.add_n(sum_square_dist_bf16) global_norm_dist_bf16 = paddle.cast( global_norm_dist_bf16, dtype=paddle.float32 ) @@ -131,30 +129,29 @@ def _dygraph_clip(self, params_grads): # global norm of non-distributed FP16 params_and_grads if len(sum_square_not_dist_bf16) == 0: global_norm_not_dist_bf16 = paddle.to_tensor( - [0.0], dtype=paddle.float32 + np.array(0.0), dtype=paddle.float32 ) else: - global_norm_not_dist_bf16 = paddle.concat(sum_square_not_dist_bf16) - global_norm_not_dist_bf16 = paddle.sum(global_norm_not_dist_bf16) + global_norm_not_dist_bf16 = paddle.add_n(sum_square_not_dist_bf16) global_norm_not_dist_bf16 = paddle.cast( global_norm_not_dist_bf16, dtype=paddle.float32 ) # global norm of distributed FP32 params_and_grads - global_norm_dist_fp32 = ( - paddle.concat(sum_square_dist_fp32) - if len(sum_square_dist_fp32) != 0 - else paddle.to_tensor([0.0], dtype=paddle.float32) - ) - global_norm_dist_fp32 = paddle.sum(global_norm_dist_fp32) + if len(sum_square_dist_fp32) == 0: + global_norm_dist_fp32 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) + else: + global_norm_dist_fp32 = paddle.add_n(sum_square_dist_fp32) # global norm of non-distributed FP32 params_and_grads - global_norm_not_dist_fp32 = ( - paddle.concat(sum_square_not_dist_fp32) - if len(sum_square_not_dist_fp32) != 0 - else paddle.to_tensor([0.0], dtype=paddle.float32) - ) - global_norm_not_dist_fp32 = paddle.sum(global_norm_not_dist_fp32) + if len(sum_square_not_dist_fp32) == 0: + global_norm_not_dist_fp32 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) + else: + global_norm_not_dist_fp32 = paddle.add_n(sum_square_not_dist_fp32) global_norm_var_dist = ( global_norm_dist_fp16 @@ -193,14 +190,14 @@ def _dygraph_clip(self, params_grads): ) max_global_norm = paddle.full( - shape=[1], + shape=[], dtype=global_norm_var_fp32.dtype, fill_value=self.clip_norm, ) clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var_fp32, 
y=max_global_norm) - + paddle.to_tensor([1.0e-6], dtype=paddle.float32), + + paddle.to_tensor(np.array(1.0e-6), dtype=paddle.float32), ) clip_var_fp16 = paddle.cast(clip_var, paddle.float16) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index f64edb8b63c6a..928c93df0b40a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -94,59 +94,64 @@ def _dygraph_clip(self, params_grads): # global norm of non-distributed FP16 params_and_grads if len(sum_square_fp16) == 0: - global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32) + global_norm_fp16 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) else: - global_norm_fp16 = paddle.concat(sum_square_fp16) - global_norm_fp16 = paddle.sum(global_norm_fp16) + global_norm_fp16 = paddle.add_n(sum_square_fp16) global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32 ) # global norm of non-distributed BFP16 params_and_grads if len(sum_square_bfp16) == 0: - global_norm_bfp16 = paddle.to_tensor([0.0], dtype=paddle.float32) + global_norm_bfp16 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) else: - global_norm_bfp16 = paddle.concat(sum_square_bfp16) - global_norm_bfp16 = paddle.sum(global_norm_bfp16) + global_norm_bfp16 = paddle.add_n(sum_square_bfp16) global_norm_bfp16 = paddle.cast( global_norm_bfp16, dtype=paddle.float32 ) # global norm of non-distributed FP16 params_and_grads for unslice parameters if len(unslice_params_fp16) == 0: - global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32) + global_unslice_fp16 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) else: - global_unslice_fp16 = paddle.concat(unslice_params_fp16) - global_unslice_fp16 = paddle.sum(global_unslice_fp16) + global_unslice_fp16 = paddle.add_n(unslice_params_fp16) global_unslice_fp16 = paddle.cast( global_unslice_fp16, dtype=paddle.float32 ) # global norm of non-distributed BFP16 params_and_grads for unslice parameters if len(unslice_params_bfp16) == 0: - global_unslice_bfp16 = paddle.to_tensor([0.0], dtype=paddle.float32) + global_unslice_bfp16 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) else: - global_unslice_bfp16 = paddle.concat(unslice_params_bfp16) - global_unslice_bfp16 = paddle.sum(global_unslice_bfp16) + global_unslice_bfp16 = paddle.add_n(unslice_params_bfp16) global_unslice_bfp16 = paddle.cast( global_unslice_bfp16, dtype=paddle.float32 ) # global norm of non-distributed FP32 params_and_grads - global_norm_fp32 = ( - paddle.concat(sum_square_fp32) - if len(sum_square_fp32) != 0 - else paddle.to_tensor([0.0], dtype=paddle.float32) - ) - global_norm_fp32 = paddle.sum(global_norm_fp32) + if len(sum_square_fp32) == 0: + global_norm_fp32 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) + else: + global_norm_fp32 = paddle.add_n(sum_square_fp32) # global norm of non-distributed FP32 params_and_grads for unslice parameters - global_unslice_fp32 = ( - paddle.concat(unslice_params_fp32) - if len(unslice_params_fp32) != 0 - else paddle.to_tensor([0.0], dtype=paddle.float32) - ) - global_unslice_fp32 = paddle.sum(global_unslice_fp32) + if len(unslice_params_fp32) == 0: + global_unslice_fp32 = paddle.to_tensor( + np.array(0.0), dtype=paddle.float32 + ) + else: + global_unslice_fp32 = paddle.add_n(unslice_params_fp32) + global_unslice_var = ( 
global_unslice_fp16 + global_unslice_fp32 + global_unslice_bfp16 ) @@ -165,7 +170,7 @@ def _dygraph_clip(self, params_grads): global_norm_var = paddle.sqrt(global_norm_var + global_unslice_var) max_global_norm = paddle.full( - shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm + shape=[], dtype=global_norm_var.dtype, fill_value=self.clip_norm ) clip_var = paddle.divide( diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py index d2f72b0c7d047..0d744d17cdd4a 100644 --- a/python/paddle/distributed/fleet/metrics/metric.py +++ b/python/paddle/distributed/fleet/metrics/metric.py @@ -40,7 +40,7 @@ def sum(input, scope=None, util=None): # in model.py input = paddle.cast(some_input, dtype='float32') cnt = paddle.sum(input) - global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0) tmp = paddle.add(cnt, global_cnt) paddle.assign(tmp, global_cnt) @@ -80,7 +80,7 @@ def max(input, scope=None, util=None): # in model.py input = paddle.cast(some_input, dtype='float32') cnt = paddle.sum(input) - global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0) tmp = paddle.maximum(cnt, global_cnt) paddle.assign(tmp, global_cnt) @@ -120,7 +120,7 @@ def min(input, scope=None, util=None): # in model.py input = paddle.cast(some_input, dtype='float32') cnt = paddle.sum(input) - global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0) + global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0) tmp = paddle.minimum(cnt, global_cnt) paddle.assign(tmp, global_cnt) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 6f44c44bf3f96..def5156f811aa 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -955,7 +955,7 @@ def _cast_loss(self): loss_op._set_attr(OP_ROLE_KEY, OpRole.Forward) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - cast_op, ref_mesh, [-1], self.dist_context + cast_op, ref_mesh, [-1 for i in loss.shape], self.dist_context ) # backward @@ -970,12 +970,20 @@ def _cast_loss(self): dtype=core.VarDesc.VarType.FP32, persistable=loss.persistable, ) - set_var_dist_attr(self.dist_context, cast_loss_grad, [-1], ref_mesh) + set_var_dist_attr( + self.dist_context, + cast_loss_grad, + [-1 for i in loss.shape], + ref_mesh, + ) pre_grad_name = first_backward_op.output_arg_names[0] first_backward_op._rename_output(pre_grad_name, cast_loss_grad.name) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - first_backward_op, ref_mesh, [-1], self.dist_context + first_backward_op, + ref_mesh, + [-1 for i in loss.shape], + self.dist_context, ) cast_grad_op = main_block._insert_op( loss_op_idx + 3, @@ -989,7 +997,10 @@ def _cast_loss(self): }, ) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - cast_grad_op, ref_mesh, [-1], self.dist_context + cast_grad_op, + ref_mesh, + [-1 for i in loss.shape], + self.dist_context, ) loss_op = cast_op loss = cast_loss @@ -1021,7 +1032,12 @@ def _scale_loss(self): dtype=loss.dtype, persistable=loss.persistable, ) - set_var_dist_attr(self.dist_context, scaled_loss, [-1], ref_mesh) + 
set_var_dist_attr( + self.dist_context, + scaled_loss, + [-1 for i in loss.shape], + ref_mesh, + ) elementwise_mul_op = main_block._insert_op( loss_op_idx + 1, @@ -1034,7 +1050,10 @@ def _scale_loss(self): ) loss_op._set_attr(OP_ROLE_KEY, OpRole.Forward) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - elementwise_mul_op, ref_mesh, [-1], self.dist_context + elementwise_mul_op, + ref_mesh, + [-1 for i in loss.shape], + self.dist_context, ) # backward @@ -1050,14 +1069,20 @@ def _scale_loss(self): persistable=loss.persistable, ) set_var_dist_attr( - self.dist_context, scaled_loss_grad, [-1], ref_mesh + self.dist_context, + scaled_loss_grad, + [-1 for i in loss.shape], + ref_mesh, ) pre_grad_name = first_backward_op.output_arg_names[0] first_backward_op._rename_output( pre_grad_name, scaled_loss_grad.name ) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - first_backward_op, ref_mesh, [-1], self.dist_context + first_backward_op, + ref_mesh, + [-1 for i in loss.shape], + self.dist_context, ) scaled_loss_grad.op = first_backward_op # FIXME(JZ-LIANG) a trick to insert backward op @@ -1085,7 +1110,10 @@ def _scale_loss(self): elementwise_mul_grad_op = main_block.ops[loss_op_idx + 3] assert elementwise_mul_grad_op.type == "elementwise_mul_grad" naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - elementwise_mul_grad_op, ref_mesh, [-1], self.dist_context + elementwise_mul_grad_op, + ref_mesh, + [-1 for i in loss.shape], + self.dist_context, ) else: scaled_loss = loss diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 7cf10cfbc7fbd..3216c1b408276 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -678,7 +678,12 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): stop_gradient=src_var.stop_gradient, ) - set_var_dist_attr(dist_context, output_var, [-1], world_process_group.ranks) + set_var_dist_attr( + dist_context, + output_var, + [-1 for i in src_var.shape], + world_process_group.ranks, + ) # TODO to support CUDAPinned/NPU/XPU Places if direction == "D2H": @@ -894,7 +899,7 @@ def _apply_single_impl(self, main_program, startup_program, context): set_var_dist_attr( self.dist_context, found_inf, - [-1], + [-1 for i in found_inf.shape], world_process_group.ranks, ) _set_op_dist_attr_with_ranks( diff --git a/python/paddle/distributed/passes/auto_parallel_grad_clip.py b/python/paddle/distributed/passes/auto_parallel_grad_clip.py index 91631b04ad625..481ba3b6c3113 100644 --- a/python/paddle/distributed/passes/auto_parallel_grad_clip.py +++ b/python/paddle/distributed/passes/auto_parallel_grad_clip.py @@ -221,7 +221,7 @@ def _init_dist_attr(self, op): in_var = self.block.vars[in_name] in_dist_attr = TensorDistAttr() in_dist_attr.process_mesh = ProcessMesh(self.world_ranks) - in_dist_attr.dims_mapping = [-1] + in_dist_attr.dims_mapping = [-1 for i in in_var.shape] self.dist_context.set_tensor_dist_attr_for_program( in_var, in_dist_attr ) @@ -230,7 +230,7 @@ def _init_dist_attr(self, op): out_var = self.block.vars[out_name] out_dist_attr = TensorDistAttr() out_dist_attr.process_mesh = ProcessMesh(self.world_ranks) - out_dist_attr.dims_mapping = [-1] + out_dist_attr.dims_mapping = [-1 for i in out_var.shape] self.dist_context.set_tensor_dist_attr_for_program( out_var, out_dist_attr ) diff --git a/python/paddle/distributed/passes/auto_parallel_quantization.py 
b/python/paddle/distributed/passes/auto_parallel_quantization.py index f1e6261cafb03..f2f35b33728bb 100644 --- a/python/paddle/distributed/passes/auto_parallel_quantization.py +++ b/python/paddle/distributed/passes/auto_parallel_quantization.py @@ -300,7 +300,7 @@ def set_dist_attr_for_qat_program( for slot_name in quant_op.desc.input_names(): in_name = quant_op.desc.input(slot_name)[0] input_var = block._var_recursive(in_name) - ref_dims_mapping = [-1] + ref_dims_mapping = [-1 for i in input_var.shape] if slot_name == "X": continue elif slot_name in ['Scale', 'ZeroPoint']: @@ -333,7 +333,7 @@ def set_dist_attr_for_qat_program( for slot_name in quant_op.desc.output_names(): output_name = quant_op.desc.output(slot_name)[0] output_var = block._var_recursive(output_name) - ref_dims_mapping = [-1] + ref_dims_mapping = [-1 for i in output_var.shape] if slot_name == "Y": dist_context.set_tensor_dist_attr_for_program( output_var, consume_input_dist_attr diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py index 4237987f997ea..f48df68baddeb 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py @@ -95,11 +95,7 @@ def check(use_cuda): fetch_list=[y_predict.name, avg_cost.name, acc_top1.name], ) step += 1 - print( - 'iter={:.0f},cost={},acc1={}'.format( - step, outs[1][0], outs[2] - ) - ) + print(f'iter={step:.0f},cost={outs[1]},acc1={outs[2]}') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py index 44fdf28caec55..bed6180caea74 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py +++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py @@ -49,17 +49,19 @@ def _compare_result_with_origin_model( ) if compare_separately: - for loss in zip(func_1_first_loss, func_2_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-5) - for loss in zip(func_1_last_loss, func_2_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=delta2) + self.assertAlmostEqual( + func_1_first_loss, func_2_first_loss, delta=1e-5 + ) + self.assertAlmostEqual( + func_1_last_loss, func_2_last_loss, delta=delta2 + ) else: np.testing.assert_allclose( func_1_loss_area, func_2_loss_area, rtol=delta2 ) self.assertAlmostEqual( - np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5 + func_1_first_loss, func_2_first_loss, delta=1e-5 ) self.assertAlmostEqual( - np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2 + func_1_last_loss, func_2_last_loss, delta=delta2 ) diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py index b1eafb3ace41a..ec6db2f6651e9 100644 --- a/python/paddle/fluid/tests/unittests/test_argsort_op.py +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -24,6 +24,7 @@ from paddle.fluid.framework import Program, grad_var_name np.random.seed(123) +paddle.enable_static() class PyArgsort: @@ -52,7 +53,7 @@ def forward(self): out = ( np.array(self.indices, dtype=self.indices.dtype), np.array(self.sorted_x, dtype=self.sorted_x.dtype), - np.array([self.loss], dtype=self.loss.dtype), + np.array(self.loss, dtype=self.loss.dtype), ) return out @@ -178,7 +179,7 @@ def get_numerical_gradient(self, delta=1e-7): f[...] = o dout_dfeed = (y_pos - y_neg) / (delta * 2) - g[...] = dout_dfeed[0] + g[...] 
= dout_dfeed return grad_list diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index e2f2cf1a34760..c23ecd6c5d896 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -674,7 +674,7 @@ def backward_value_helper(self, cond_func, use_cuda): }, fetch_list=[loss.name], ) - numerical_grad[0][j] = (loss_delta[0] - loss_value[0]) / delta + numerical_grad[0][j] = (loss_delta - loss_value) / delta feed_img_delta[0][j] = feed_img[0][j] np.testing.assert_allclose( img_grad, numerical_grad, rtol=0.05, atol=0.05 diff --git a/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py index 1b571e176fd42..882d2f505a718 100644 --- a/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py @@ -64,7 +64,7 @@ def run_dynamic(self): reduction='mean', ) np.testing.assert_allclose(dy_result.numpy(), expected1, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) dy_result = paddle.nn.functional.cosine_embedding_loss( input1, input2, label, margin=0.5, reduction='sum' @@ -78,7 +78,7 @@ def run_dynamic(self): ) np.testing.assert_allclose(dy_result.numpy(), expected2, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) dy_result = paddle.nn.functional.cosine_embedding_loss( input1, input2, label, margin=0.5, reduction='none' @@ -92,7 +92,7 @@ def run_dynamic(self): ) np.testing.assert_allclose(dy_result.numpy(), expected3, rtol=1e-05) - self.assertTrue(dy_result.shape, [5]) + self.assertEqual(dy_result.shape, [5]) def run_static(self, use_gpu=False): input1 = static.data(name='input1', shape=[5, 3], dtype='float64') @@ -257,7 +257,7 @@ def run_dynamic(self): reduction='mean', ) np.testing.assert_allclose(dy_result.numpy(), expected1, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) input1_1D = paddle.to_tensor(self.input1_np_1D) input2_1D = paddle.to_tensor(self.input2_np_1D) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index dca923b3faa71..c01b05d1e462f 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -42,7 +42,7 @@ def step(self, step_id, x): def forward(self): for step_id in range(self.x.shape[0]): self.step(step_id, self.x[step_id]) - return np.array([np.mean(self.y)]) + return np.mean(self.y) def segment_inputs(self): return [self.x[i] for i in range(self.x.shape[0])] @@ -251,7 +251,7 @@ def get_numerical_gradient(self, delta=0.005): f[...] = o dout_dfeed = (y_pos - y_neg) / (delta * 2) - g[...] = dout_dfeed[0] + g[...] 
= dout_dfeed return grad_list diff --git a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py index b5936275dd0c2..bf0f894683d5e 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_lod_tensor_array.py @@ -69,9 +69,10 @@ def check_network(self, use_cuda=True): loss_v, array_v = exe.run( binary, feed=feed_dict, fetch_list=[loss, array] ) - self.assertEqual(np.array(loss_v).shape, (1,)) - self.assertEqual(np.array(array_v[0]).shape, (batch_size, 784)) - self.assertEqual(np.array(array_v[1]).shape, (batch_size, 1)) + self.assertEqual(loss_v.shape, ()) + self.assertEqual(array_v[0].shape, (batch_size, 784)) + self.assertEqual(array_v[1].shape, (batch_size, 1)) + self.assertEqual(array_v[2].shape, ()) np.testing.assert_allclose(loss_v, array_v[2], rtol=1e-05) def test_fetch_lod_tensor_array(self): @@ -81,4 +82,5 @@ def test_fetch_lod_tensor_array(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py index eb978064a5419..379010c8e42a4 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py @@ -78,10 +78,12 @@ def compare_fuse_all_reduce_ops( optimizer=optimizer, ) - for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) - for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + self.assertAlmostEqual( + not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 + ) + self.assertAlmostEqual( + not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 + ) def optimizer(self, learning_rate=1e-3): optimizer = fluid.optimizer.SGD( diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 38ac248d0760e..6df1a3209ad12 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -98,7 +98,7 @@ def check(self, place, use_cuda): loss_v = exe.run( binary, feed=feeder.feed(data), fetch_list=[loss] ) - loss_vals.append(loss_v[0][0]) + loss_vals.append(loss_v[0]) # open fused_bn_act_ops build_strategy_fused = fluid.BuildStrategy() @@ -118,7 +118,7 @@ def check(self, place, use_cuda): loss_v = exe.run( binary_fused, feed=feeder.feed(data), fetch_list=[loss] ) - loss_vals_fused.append(loss_v[0][0]) + loss_vals_fused.append(loss_v[0]) # check loss for i in range(iters): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index bbbbc4a91a303..63796599278ce 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -216,7 +216,7 @@ def check(self, place, use_cuda): loss_v = exe.run( binary_fused, feed={"x": x, "y": y}, fetch_list=[loss] ) - loss_vals_fused.append(loss_v[0][0]) + loss_vals_fused.append(loss_v[0]) # build_origin_program: turn off fused_bn_act_ops build_strategy = fluid.BuildStrategy() @@ -234,7 +234,7 @@ def check(self, place, use_cuda): feed={"x": x_data[i], "y": y_data[i]}, fetch_list=[loss], ) - 
loss_vals.append(loss_v[0][0]) + loss_vals.append(loss_v[0]) # check loss for i in range(iters): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py index 29fc8944e2107..cce3451ed5f61 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py @@ -74,10 +74,12 @@ def _optimizer(learning_rate=1e-6): optimizer=_optimizer, ) - for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) - for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + self.assertAlmostEqual( + not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 + ) + self.assertAlmostEqual( + not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 + ) def test_simple_fc_with_fuse_op(self): self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py index 71eaf8bc5420b..4e7f1901d72e4 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -70,10 +70,12 @@ def _compare_fused_optimizer_ops( optimizer=optimizer, ) - for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) - for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + self.assertAlmostEqual( + not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 + ) + self.assertAlmostEqual( + not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 + ) def _decorate_compare_fused_optimizer_ops( self, model, use_device, optimizer diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index fea42b00140a9..2a48de7bedda9 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -118,10 +118,12 @@ def _optimizer(learning_rate=1e-6): optimizer=_optimizer, ) - for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) - for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + self.assertAlmostEqual( + not_fuse_op_first_loss, fuse_op_first_loss, delta=1e-6 + ) + self.assertAlmostEqual( + not_fuse_op_last_loss, fuse_op_last_loss, delta=1e-6 + ) def test_simple_depthwise_with_fuse_op(self): self._compare(simple_depthwise_net, DeviceType.CUDA) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 9a20c536bc6f3..dc84d9e70b028 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -152,7 +152,7 @@ def check_sparse_gradient_clip(self, place): data = next(self.train_data()) val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0] - self.assertEqual((1,), val.shape) + self.assertEqual(val.shape, ()) self.assertFalse(np.isnan(val)) def backward_and_optimize(self, cost): diff --git 
a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py index 1ce6c1bc5f025..5bfeb0f0f143a 100644 --- a/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py +++ b/python/paddle/fluid/tests/unittests/test_hinge_embedding_loss.py @@ -50,7 +50,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): dy_result = paddle.nn.functional.hinge_embedding_loss(input, label) expected = calc_hinge_embedding_loss(self.input_np, self.label_np) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='sum' @@ -59,7 +59,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): self.input_np, self.label_np, reduction='sum' ) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) dy_result = paddle.nn.functional.hinge_embedding_loss( input, label, reduction='none' @@ -68,7 +68,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): self.input_np, self.label_np, reduction='none' ) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, self.shape) + self.assertEqual(dy_result.shape, list(self.shape)) def run_static_check(self, place=paddle.CPUPlace): paddle.enable_static() @@ -129,7 +129,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): dy_result = hinge_embedding_loss(input, label) expected = calc_hinge_embedding_loss(self.input_np, self.label_np) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='sum' @@ -139,7 +139,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): self.input_np, self.label_np, reduction='sum' ) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) hinge_embedding_loss = paddle.nn.loss.HingeEmbeddingLoss( reduction='none' @@ -149,7 +149,7 @@ def run_dynamic_check(self, place=paddle.CPUPlace()): self.input_np, self.label_np, reduction='none' ) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertTrue(dy_result.shape, self.shape) + self.assertTrue(dy_result.shape, list(self.shape)) def run_static_check(self, place=paddle.CPUPlace): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py index 046ad9a89cb12..4fff201519ca2 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -80,10 +80,9 @@ def _compare_ir_memory_optimize(self, model, use_device): use_device=use_device, use_ir_memory_optimize=True, ) - for loss in zip(first_loss0, first_loss1): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) - for loss in zip(last_loss0, last_loss1): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) + + self.assertAlmostEqual(first_loss0, first_loss1, delta=1e-6) + self.assertAlmostEqual(last_loss0, last_loss1, delta=1e-6) def test_simple_fc_net(self): self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU) diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py 
b/python/paddle/fluid/tests/unittests/test_l1_loss.py index ce5d4127fbc7e..20295d1a9a92b 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -36,7 +36,7 @@ def run_imperative(self): dy_result = paddle.nn.functional.l1_loss(input, label, reduction='sum') expected = np.sum(np.abs(self.input_np - self.label_np)) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertEqual(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) dy_result = paddle.nn.functional.l1_loss(input, label, reduction='none') expected = np.abs(self.input_np - self.label_np) @@ -125,7 +125,7 @@ def run_imperative(self): dy_result = l1_loss(input, label) expected = np.sum(np.abs(self.input_np - self.label_np)) np.testing.assert_allclose(dy_result.numpy(), expected, rtol=1e-05) - self.assertEqual(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, []) l1_loss = paddle.nn.loss.L1Loss(reduction='none') dy_result = l1_loss(input, label) diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index ba680f91458dc..37fb0a6a95883 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -118,7 +118,7 @@ def test_NNMseLoss_mean(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, ()) def test_NNMseLoss_sum(self): for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]: @@ -164,7 +164,7 @@ def test_NNMseLoss_sum(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, ()) def test_NNMseLoss_none(self): for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]: @@ -210,7 +210,7 @@ def test_NNMseLoss_none(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, tuple(dim)) class TestNNFunctionalMseLoss(unittest.TestCase): @@ -254,7 +254,7 @@ def test_NNFunctionalMseLoss_mean(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, ()) def test_NNFunctionalMseLoss_sum(self): for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]: @@ -296,7 +296,7 @@ def test_NNFunctionalMseLoss_sum(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, ()) def test_NNFunctionalMseLoss_none(self): for dim in [[10, 10], [2, 10, 10], [3, 3, 10, 10]]: @@ -338,7 +338,7 @@ def test_NNFunctionalMseLoss_none(self): np.testing.assert_allclose(static_result, expected, rtol=1e-05) np.testing.assert_allclose(static_result, dy_result, rtol=1e-05) 
np.testing.assert_allclose(dy_result, expected, rtol=1e-05) - self.assertTrue(dy_result.shape, [1]) + self.assertEqual(dy_result.shape, tuple(dim)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 5e233e26a6283..4a60902768fd4 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -42,10 +42,6 @@ def check_nan_inf(self): out, err = proc.communicate() returncode = proc.returncode - - print(out) - print(err) - # in python3, type(out+err) is 'bytes', need use encode assert (out + err).find(b'There are NAN or INF') != -1 diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py index 79cbabee262c7..c598073f434a8 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py @@ -110,7 +110,7 @@ def train(dot_save_dir, prefix, seed=1234): loss_values = [] for step in range(iters): loss_v = exe.run(compiled_program, feed=feed[step], fetch_list=[loss]) - loss_values.append(loss_v[0][0]) + loss_values.append(loss_v[0]) return loss_values diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py index fa1abbed09505..d1e3d9f6d0c9a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py @@ -48,10 +48,14 @@ def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5): optimizer=seresnext_net.optimizer, ) - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-5) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEqual(loss[0], loss[1], delta=loss[0] * delta2) + self.assertAlmostEqual( + all_reduce_first_loss, reduce_first_loss, delta=1e-5 + ) + self.assertAlmostEqual( + all_reduce_last_loss, + reduce_last_loss, + delta=all_reduce_last_loss * delta2, + ) if not use_device: return @@ -86,20 +90,32 @@ def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5): enable_sequential_execution=True, ) - for loss in zip(all_reduce_first_loss, all_reduce_first_loss_seq): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-5) - for loss in zip(all_reduce_last_loss, all_reduce_last_loss_seq): - self.assertAlmostEqual(loss[0], loss[1], delta=loss[0] * delta2) + self.assertAlmostEqual( + all_reduce_first_loss, all_reduce_first_loss_seq, delta=1e-5 + ) + self.assertAlmostEqual( + all_reduce_last_loss, + all_reduce_last_loss_seq, + delta=all_reduce_last_loss * delta2, + ) - for loss in zip(reduce_first_loss, reduce_first_loss_seq): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-5) - for loss in zip(reduce_last_loss, reduce_last_loss_seq): - self.assertAlmostEqual(loss[0], loss[1], delta=loss[0] * delta2) + self.assertAlmostEqual( + reduce_first_loss, reduce_first_loss_seq, delta=1e-5 + ) + self.assertAlmostEqual( + reduce_last_loss, + reduce_last_loss_seq, + delta=reduce_last_loss * delta2, + ) - for loss in zip(all_reduce_first_loss_seq, reduce_first_loss_seq): - self.assertAlmostEqual(loss[0], loss[1], delta=1e-5) - for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): - 
-            self.assertAlmostEqual(loss[0], loss[1], delta=loss[0] * delta2)
+        self.assertAlmostEqual(
+            all_reduce_first_loss_seq, reduce_first_loss_seq, delta=1e-5
+        )
+        self.assertAlmostEqual(
+            all_reduce_last_loss_seq,
+            reduce_last_loss_seq,
+            delta=all_reduce_last_loss_seq * delta2,
+        )
 
 
 class TestResnetWithReduceCPU(TestResnetWithReduceBase):
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index 5a468d8e3ef3a..8874d955c06ab 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -37,7 +37,7 @@ def step(self, step_id, x):
     def forward(self):
         for step_id in range(self.x.shape[0]):
             self.step(step_id, self.x[step_id])
-        return np.array([np.mean(self.y)])
+        return np.mean(self.y)
 
     def segment_inputs(self):
         return [self.x[i] for i in range(self.x.shape[0])]
@@ -239,7 +239,7 @@ def get_numerical_gradient(self, delta=0.005):
 
                 f[...] = o
                 dout_dfeed = (y_pos - y_neg) / (delta * 2)
-                g[...] = dout_dfeed[0]
+                g[...] = dout_dfeed
 
         return grad_list
diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
index 85517ffeba2d0..d262319eee8e3 100644
--- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
+++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py
@@ -103,7 +103,7 @@ def train(self, place, iters, feed, use_cinn=False, seed=1234):
                 fetch_list=[loss],
                 return_numpy=True,
             )
-            loss_vals.append(loss_v[0][0])
+            loss_vals.append(loss_v[0])
         return loss_vals
 
     def test_check_resnet50_accuracy(self):
diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py
index b780bf493397c..6d86f9f60f7a4 100644
--- a/python/paddle/fluid/tests/unittests/test_run_program_op.py
+++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py
@@ -510,7 +510,7 @@ def test_stop_gradient(self):
 
         dy_loss = self.train(to_static=False)
         st_loss = self.train(to_static=True)
-        self.assertEqual(dy_loss[0], st_loss[0])
+        self.assertEqual(dy_loss, st_loss)
 
         paddle.enable_static()
 
diff --git a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
index 9c049ddbf435d..3ab74278f28e8 100644
--- a/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_zero_dim_tensor.py
@@ -219,14 +219,6 @@ def test_dygraph_reduce(self):
         self.assertEqual(x.grad.shape, [])
         np.testing.assert_allclose(x.grad.numpy(), np.array(3.0))
 
-        if api in [
-            paddle.sum,
-            paddle.mean,
-            paddle.nanmean,
-            paddle.nansum,
-        ]:
-            return
-
         # 2) x is ND, reduce to 0D
         if api in [paddle.all, paddle.any]:
             x = paddle.randint(0, 2, [3, 5]).astype('bool')
         else:
             x = paddle.rand([3, 5])
@@ -301,20 +293,11 @@ def test_static_reduce(self):
         np.testing.assert_allclose(res[2], np.array(1.0))
         np.testing.assert_allclose(res[3], np.array(1.0))
 
-        if api in [
-            paddle.sum,
-            paddle.mean,
-            paddle.nanmean,
-            paddle.nansum,
-        ]:
-            return
-
         # 2) x is ND, reduce to 0D
         if api in [paddle.all, paddle.any]:
             x = paddle.randint(0, 2, [3, 5]).astype('bool')
         else:
             x = paddle.rand([3, 5])
-        x = paddle.rand([3, 5])
         x.stop_gradient = False
         out = api(x, None)
         paddle.static.append_backward(out)
@@ -1364,6 +1347,7 @@ def test_metric_accuracy(self):
         self.assertEqual(out.shape, [])
 
     def test_std(self):
+        # 1) x is 0D
         x = paddle.rand([])
         x.stop_gradient = False
         out1 = paddle.std(x)
@@ -1371,18 +1355,24 @@ def test_std(self):
         out1.backward()
         out2.backward()
 
-        # checkout shape of out
         self.assertEqual(out1.shape, [])
         self.assertEqual(out2.shape, [])
-
-        # checkout value of out
         self.assertEqual(out1, 0)
         self.assertEqual(out2, 0)
 
-        # checkout backward
         self.assertEqual(x.grad.shape, [])
 
+        # 2) x is ND
+        x = paddle.rand([3, 5])
+        x.stop_gradient = False
+        out = paddle.std(x)
+        out.backward()
+
+        self.assertEqual(out.shape, [])
+        self.assertEqual(x.grad.shape, [3, 5])
+
     def test_var(self):
+        # 1) x is 0D
         x = paddle.rand([])
         x.stop_gradient = False
         out1 = paddle.var(x)
@@ -1390,18 +1380,23 @@ def test_var(self):
         out1.backward()
         out2.backward()
 
-        # checkout shape of out
         self.assertEqual(out1.shape, [])
         self.assertEqual(out2.shape, [])
-
-        # checkout value of out
         self.assertEqual(out1, 0)
         self.assertEqual(out2, 0)
 
-        # checkout backward
         self.assertEqual(x.grad.shape, [])
         np.testing.assert_allclose(x.grad, 0)
 
+        # 2) x is ND
+        x = paddle.rand([3, 5])
+        x.stop_gradient = False
+        out = paddle.var(x)
+        out.backward()
+
+        self.assertEqual(out.shape, [])
+        self.assertEqual(x.grad.shape, [3, 5])
+
     def test_quantile(self):
         # 1) x is 0D
         x = paddle.rand([])
@@ -1597,7 +1592,6 @@ def test_clip(self):
         out = paddle.clip(x, -5, 5)
         out.retain_grads()
         out.backward()
-
         self.assertEqual(out.shape, [])
         self.assertEqual(out.grad.shape, [])
         self.assertEqual(x.grad.shape, [])
@@ -1607,7 +1601,6 @@ def test_clip(self):
         out1 = paddle.clip(x1, paddle.full([], 5.0), paddle.full([], 5.0))
         out1.retain_grads()
         out1.backward()
-
         self.assertEqual(out1.shape, [])
         self.assertEqual(out1.grad.shape, [])
         self.assertEqual(x1.grad.shape, [])
@@ -5152,8 +5145,7 @@ def test_Categorical(self):
         self.assertEqual(
             d.log_prob(paddle.full([], 2, dtype='int64')).shape, []
         )
-        # because use paddle.sum
-        # self.assertEqual(d.entropy().shape, [])
+        self.assertEqual(d.entropy().shape, [])
 
     def test_Normal(self):
         normal = paddle.distribution.Normal(0.0, 3.0)
@@ -5196,10 +5188,9 @@ def test_Beta(self):
         self.assertEqual(beta.sample([]).shape, [])
         self.assertEqual(beta.mean.shape, [])
         self.assertEqual(beta.variance.shape, [])
-        # because use paddle.sum
-        # self.assertEqual(beta.prob(self.x).shape, [])
-        # self.assertEqual(beta.log_prob(self.x).shape, [])
-        # self.assertEqual(beta.entropy().shape, [])
+        self.assertEqual(beta.prob(self.x).shape, [])
+        self.assertEqual(beta.log_prob(self.x).shape, [])
+        self.assertEqual(beta.entropy().shape, [])
 
     def test_kl_divergence(self):
         p = paddle.distribution.Beta(alpha=0.5, beta=0.5)
@@ -5258,10 +5249,9 @@ def test_Multinomial(self):
         d = paddle.distribution.Multinomial(
             10, paddle.to_tensor([0.2, 0.3, 0.5])
         )
-        # because use paddle.sum
-        # self.assertEqual(d.prob(self.x).shape, [])
-        # self.assertEqual(d.log_prob(self.x).shape, [])
-        # self.assertEqual(d.entropy().shape, [])
+        self.assertEqual(d.prob(self.x).shape, [])
+        self.assertEqual(d.log_prob(self.x).shape, [])
+        self.assertEqual(d.entropy().shape, [])
 
 
 class TestLossAPI(unittest.TestCase):
@@ -5279,10 +5269,10 @@ def test_sigmoid_focal_loss(self):
         fg_num_1 = paddle.full([1], 2.0)
 
         out0 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_0, reduction='mean'
+            logit, label, normalizer=fg_num_0, reduction='sum'
         )
         out1 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_1, reduction='mean'
+            logit, label, normalizer=fg_num_1, reduction='sum'
         )
 
         out0.retain_grads()
@@ -5297,6 +5287,28 @@ def test_sigmoid_focal_loss(self):
         self.assertEqual(out0.grad.shape, [])
         self.assertEqual(logit.grad.shape, [2, 3])
 
+    def test_cross_entropy(self):
+        input = paddle.rand([3, 5])
+        input.stop_gradient = False
+        label = paddle.randint(0, 5, shape=[3])
+
+        loss = paddle.nn.functional.cross_entropy(input, label, reduction='sum')
+        loss.backward()
+
+        self.assertEqual(loss.shape, [])
+        self.assertEqual(input.grad.shape, [3, 5])
+
+    def test_l1_loss(self):
+        input = paddle.rand([3, 5])
+        input.stop_gradient = False
+        label = paddle.rand([3, 5])
+
+        loss = paddle.nn.functional.l1_loss(input, label, reduction='mean')
+        loss.backward()
+
+        self.assertEqual(loss.shape, [])
+        self.assertEqual(input.grad.shape, [3, 5])
+
 
 class TestLossAPIStatic(unittest.TestCase):
     def setUp(self):
@@ -5327,12 +5339,42 @@ def test_sigmoid_focal_loss(self):
             prog, fetch_list=[out0, out1, out0.grad_name, logit.grad_name]
         )
         np.testing.assert_allclose(res[0], res[1])
-        # because static use paddle.mean
-        # self.assertEqual(res[0].shape, ())
-        # self.assertEqual(res[1].shape, ())
-        # self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[2].shape, ())
         self.assertEqual(res[3].shape, (2, 3))
 
+    @prog_scope()
+    def test_cross_entropy(self):
+        input = paddle.rand([3, 5])
+        input.stop_gradient = False
+        label = paddle.randint(0, 5, shape=[3])
+        label.stop_gradient = False
+
+        loss = paddle.nn.functional.cross_entropy(
+            input, label, reduction='mean'
+        )
+        paddle.static.append_backward(loss)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (3, 5))
+
+    @prog_scope()
+    def test_l1_loss(self):
+        input = paddle.rand([3, 5])
+        input.stop_gradient = False
+        label = paddle.rand([3, 5])
+
+        loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
+        paddle.static.append_backward(loss)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (3, 5))
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py
index 72bc1601bfacc..4dbb517f66c04 100644
--- a/python/paddle/incubate/autograd/composite_rules.py
+++ b/python/paddle/incubate/autograd/composite_rules.py
@@ -252,7 +252,7 @@ def mean_composite(x, axis, keepdim):
         operator.mul, [x.shape[axis] for axis in axes]
     )
     norm = fill_constant(
-        shape=x.shape if len(x.shape) == 0 else [1],
+        shape=[],
         value=value_to_fill,
         dtype=sum_x.dtype,
     )
diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py
index a386347ce2770..59ba6bb8f9454 100644
--- a/python/paddle/incubate/distributed/models/moe/grad_clip.py
+++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -142,22 +142,18 @@ def get_l2_norm_pow(params_grads, sum_dtype=None):
 
         global_norm_var = []
         if len(sum_square_list_fp16) > 0:
-            global_norm_var_fp16 = paddle.concat(sum_square_list_fp16)
-            global_norm_var_fp16 = paddle.sum(global_norm_var_fp16)
+            global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16)
             global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
         if len(sum_square_list_fp32) > 0:
-            global_norm_var_fp32 = paddle.concat(sum_square_list_fp32)
-            global_norm_var_fp32 = paddle.sum(global_norm_var_fp32)
+            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
             if sum_dtype == 'float32':
                 global_norm_var.append(global_norm_var_fp32)
             else:
                 global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
         if len(sum_square_list) > 0:
-            global_norm_var_fp64 = paddle.concat(sum_square_list)
-            global_norm_var_fp64 = paddle.sum(global_norm_var_fp64)
+            global_norm_var_fp64 = paddle.add_n(sum_square_list)
             global_norm_var.append(global_norm_var_fp64)
-        global_norm_var = paddle.concat(global_norm_var)
-        global_norm_var = paddle.sum(global_norm_var)
+        global_norm_var = paddle.add_n(global_norm_var)
         return global_norm_var, sum_dtype
 
     @no_grad()
diff --git a/test/auto_parallel/test_while_op_partition.py b/test/auto_parallel/test_while_op_partition.py
index fdbcee8255a2b..6dc02d6834f5b 100644
--- a/test/auto_parallel/test_while_op_partition.py
+++ b/test/auto_parallel/test_while_op_partition.py
@@ -206,7 +206,7 @@ def get_program():
         auto.shard_tensor(error_cost, _g_process_mesh, [None, None, None])
 
         loss = paddle.mean(error_cost)
-        auto.shard_tensor(loss, _g_process_mesh, [None])
+        auto.shard_tensor(loss, _g_process_mesh, [])
 
     return train_program, start_program, dataloader, i, loss
 
diff --git a/test/autograd/test_autograd_functional_static.py b/test/autograd/test_autograd_functional_static.py
index 19ab630651b99..38c27c5e2384e 100644
--- a/test/autograd/test_autograd_functional_static.py
+++ b/test/autograd/test_autograd_functional_static.py
@@ -41,14 +41,14 @@
         'v_not_none',
         utils.reduce,
         np.random.rand(2, 3),
-        np.random.rand(1),
+        np.array(np.random.rand()),
         False,
     ),
     (
         'xs_stop_gradient',
         utils.reduce,
         np.random.rand(2, 3),
-        np.random.rand(1),
+        np.array(np.random.rand()),
         True,
     ),
     (
diff --git a/test/contrib/test_multi_precision_fp16_train.py b/test/contrib/test_multi_precision_fp16_train.py
index a364d2161ebe4..96ad34e5c0c99 100644
--- a/test/contrib/test_multi_precision_fp16_train.py
+++ b/test/contrib/test_multi_precision_fp16_train.py
@@ -178,7 +178,7 @@ def train_loop():
             (loss,) = exe.run(
                 train_program, feed=feeder.feed(data), fetch_list=[sum_cost]
             )
-            loss_v = loss[0] if isinstance(loss, np.ndarray) else loss
+            loss_v = float(loss) if isinstance(loss, np.ndarray) else loss
             print(
                 'PassID {:1}, Train Batch ID {:04}, train loss {:2.4}'.format(
                     pass_id, batch_id + 1, float(loss_v)
diff --git a/test/distribution/test_distribution_transform.py b/test/distribution/test_distribution_transform.py
index 63ecd99d77baa..640391b472d7a 100644
--- a/test/distribution/test_distribution_transform.py
+++ b/test/distribution/test_distribution_transform.py
@@ -1205,7 +1205,7 @@ def test_inverse_shape(self, shape, expected_shape):
     @param.param_func(((np.random.random(10),),))
     def test_forward_log_det_jacobian(self, x):
         self.assertEqual(
-            self._t.forward_log_det_jacobian(paddle.to_tensor(x)).shape, [1]
+            self._t.forward_log_det_jacobian(paddle.to_tensor(x)).shape, []
         )
 
 
diff --git a/test/legacy_test/test_async_read_write.py b/test/legacy_test/test_async_read_write.py
index 4fc20039881b9..98aee8d905669 100644
--- a/test/legacy_test/test_async_read_write.py
+++ b/test/legacy_test/test_async_read_write.py
@@ -65,7 +65,7 @@ def func_test_async_read_success(self):
         )
         # index data
         index_array1 = paddle.gather(self.src, self.index)
-        count_numel = paddle.sum(count).numpy()[0]
+        count_numel = paddle.sum(count).item()
         index_array2 = self.dst[count_numel : count_numel + len(self.index)]
         np.testing.assert_allclose(
             index_array1.numpy(), index_array2.numpy(), rtol=1e-05
         )
diff --git a/test/prim/prim/vjp/eager/test_comp_eager_sum_grad.py b/test/prim/prim/vjp/eager/test_comp_eager_sum_grad.py
index bed88e395210b..9cade52686fe1 100644
--- a/test/prim/prim/vjp/eager/test_comp_eager_sum_grad.py
+++ b/test/prim/prim/vjp/eager/test_comp_eager_sum_grad.py
@@ -41,7 +41,7 @@ def desired(primal, cotangent, axis, keep_dim):
 class TestSumGradComp(unittest.TestCase):
     def test_sum_grad_comp_1(self):
         self.primal = np.random.rand(10, 10)
-        self.cotangent = np.random.rand(1)
+        self.cotangent = np.array(np.random.rand())
         paddle.disable_static()
 
         np.testing.assert_allclose(
diff --git a/test/standalone_executor/test_standalone_cuda_graph_multi_stream.py b/test/standalone_executor/test_standalone_cuda_graph_multi_stream.py
index f6deb87d1c602..bee898653ca07 100644
--- a/test/standalone_executor/test_standalone_cuda_graph_multi_stream.py
+++ b/test/standalone_executor/test_standalone_cuda_graph_multi_stream.py
@@ -126,7 +126,7 @@ def test_result(self):
 
         for out in outs:
             for baseline, result in zip(outs[0], out):
-                self.assertEqual(baseline[0], result[0])
+                self.assertEqual(baseline, result)
 
 
 if __name__ == "__main__":
diff --git a/test/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py
index 9ecce0af8305d..9369a9b0ed3ff 100644
--- a/test/xpu/test_zero_dim_tensor_xpu.py
+++ b/test/xpu/test_zero_dim_tensor_xpu.py
@@ -1269,10 +1269,11 @@ def test_sigmoid_focal_loss(self):
             out0.numpy(),
             out1.numpy(),
         )
+        self.assertEqual(out0.shape, [])
 
         out0.retain_grads()
         out0.backward()
-        self.assertEqual(out0.grad.shape, [1])
+        self.assertEqual(out0.grad.shape, [])
         self.assertEqual(logit.grad.shape, [2, 3])
 
     def test_allclose(self):