diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index b28c884429c17..1de6d26d05b9e 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -183,7 +183,7 @@ message DistributedStrategy {
   optional bool use_hierarchical_allreduce = 15 [ default = false ];
   optional int32 hierarchical_allreduce_inter_nranks = 16 [ default = 1 ];
   optional bool sync_batch_norm = 17 [ default = false ];
-  optional bool fuse_all_reduce_ops = 18 [ default = false ];
+  optional bool fuse_all_reduce_ops = 18 [ default = true ];
   optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
   optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
   optional bool cudnn_exhaustive_search = 21 [ default = false ];
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
index a1cd0df8d7c7e..f9e28c003f509 100755
--- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py
@@ -366,6 +366,8 @@ def test_sharding_hybrid_dp(self):
             "gradient_merge_acc_step": 1,
             "mp_degree": 1
         }
+
+        strategy.fuse_all_reduce_ops = False
         self.optimizer(avg_cost, strategy, train_prog, startup_prog)
         startup_prog_ops = startup_prog.global_block().ops
         main_prog_ops = train_prog.global_block().ops
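
This change flips the proto default of `fuse_all_reduce_ops` from `false` to `true`, so all-reduce fusion is now on unless a user disables it, which is exactly what the updated sharding test does. As a minimal sketch (assuming the standard `paddle.distributed.fleet.DistributedStrategy` Python API that wraps this proto), opting out looks like this:

```python
# Sketch, not part of the patch: illustrates the new default and the explicit
# opt-out used by test_sharding_hybrid_dp.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
# With the new proto default, fusion is enabled out of the box.
print(strategy.fuse_all_reduce_ops)  # expected: True

# Disable all-reduce fusion explicitly, as the updated test now does.
strategy.fuse_all_reduce_ops = False
print(strategy.fuse_all_reduce_ops)  # False
```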