[Hackathon 5th No.64] Add dynamic-to-static (@to_static) training support for PaddleNLP models - part #7576

Merged: 4 commits merged on Dec 18, 2023
4 changes: 0 additions & 4 deletions model_zoo/bert/README.md
@@ -65,7 +65,6 @@ python -m paddle.distributed.launch --gpus "0" run_pretrain.py \
--weight_decay 1e-2 \
--adam_epsilon 1e-6 \
--warmup_steps 10000 \
--num_train_epochs 3 \
Contributor Author: The `num_train_epochs` argument is not used anywhere in this script, and setting it raises an error, so it has been removed.

--input_dir data/ \
--output_dir pretrained_models/ \
--logging_steps 1 \
@@ -83,7 +82,6 @@ python -m paddle.distributed.launch --gpus "0" run_pretrain.py \
- `weight_decay` is the weight-decay coefficient used by the AdamW optimizer.
- `adam_epsilon` is the epsilon value used by the AdamW optimizer.
- `warmup_steps` is the number of warmup steps for the dynamic learning-rate schedule.
- `num_train_epochs` is the number of training epochs.
- `input_dir` is the input data directory; every file in this directory whose name contains "training" is used as training data.
- `output_dir` is the directory where the model is saved.
- `logging_steps` is the interval, in steps, between log outputs.
@@ -128,7 +126,6 @@ python -m paddle.distributed.launch --xpus "0" run_pretrain.py \
--weight_decay 1e-2 \
--adam_epsilon 1e-6 \
--warmup_steps 10000 \
--num_train_epochs 3 \
--input_dir data/ \
--output_dir pretrained_models/ \
--logging_steps 1 \
@@ -146,7 +143,6 @@ python -m paddle.distributed.launch --xpus "0" run_pretrain.py \
- `weight_decay` is the weight-decay coefficient used by the AdamW optimizer.
- `adam_epsilon` is the epsilon value used by the AdamW optimizer.
- `warmup_steps` is the number of warmup steps for the dynamic learning-rate schedule.
- `num_train_epochs` is the number of training epochs.
- `input_dir` is the input data directory; every file in this directory whose name contains "training" is used as training data.
- `output_dir` is the directory where the model is saved.
- `logging_steps` is the interval, in steps, between log outputs.
4 changes: 4 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -264,6 +264,10 @@
if model is None:
raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument")

if self.args.to_static:
model = paddle.jit.to_static(model)
logger.info("Successfully to apply @to_static to the whole model.")

if self.args.should_save or self.args.should_save_model_state:
os.makedirs(self.args.output_dir, exist_ok=True)

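For reference, a minimal sketch of the behaviour this hunk introduces, assuming the `to_static` field is available on `TrainingArguments` as added below in `training_args.py`. The `paddle.nn.Linear` stand-in model and the output directory are illustrative only and not part of this PR.

```python
# Hedged usage sketch (not part of this diff): exercising the new `to_static` flag.
import paddle
from paddlenlp.trainer import TrainingArguments

# `output_dir` is required; `to_static` is the field this PR adds to TrainingArguments.
args = TrainingArguments(output_dir="./checkpoints", to_static=True)

# A tiny stand-in model, purely to illustrate the wrapping step the Trainer now performs.
model = paddle.nn.Linear(8, 2)

if args.to_static:
    # Same conversion the Trainer applies before training: dynamic graph to static graph.
    model = paddle.jit.to_static(model)

out = model(paddle.randn([4, 8]))  # forward now runs through the converted static program
print(out.shape)  # [4, 2]
```
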
4 changes: 4 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -705,6 +705,10 @@
default=False,
metadata={"help": "Whether to unify hybrid parallel checkpoint."},
)
to_static: Optional[bool] = field(
default=False,
metadata={"help": "Enable training under @to_static."},
)
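
As a usage note, this field should surface as a `--to_static` command-line flag through PaddleNLP's usual dataclass argument parsing. The sketch below assumes the standard `PdArgumentParser` behaviour (mirrored from `HfArgumentParser`) and an illustrative output directory.

```python
# Hedged sketch: how the new field is expected to surface as a CLI flag
# (flag name inferred from the dataclass field; not part of this diff).
from paddlenlp.trainer import PdArgumentParser, TrainingArguments

parser = PdArgumentParser(TrainingArguments)
(training_args,) = parser.parse_args_into_dataclasses(
    ["--output_dir", "./out", "--to_static", "true"]
)
print(training_args.to_static)  # True
```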

def __post_init__(self):
env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1))
@@ -809,7 +813,7 @@

try:
self.use_auto_parallel = self.parallel_mode == "auto"
except:

pass

if paddle.distributed.get_world_size() > 1 and (
@@ -1050,34 +1054,34 @@
fleet.init(is_collective=True, strategy=strategy)
logger.info(strategy)

elif self.use_auto_parallel:
world_size = paddle.distributed.get_world_size()
tensor_parallel_degree = max(self.tensor_parallel_degree, 1)

pipeline_parallel_degree = max(self.pipeline_parallel_degree, 1)

assert (
world_size % (tensor_parallel_degree * pipeline_parallel_degree) == 0
), f"Total world_size:{world_size} shoule be devided by tensor_parallel_degree: {self.tensor_parallel_degree} and pipeline_parallel_degree: {self.pipeline_parallel_degree}."

self.data_parallel_degree = world_size // (tensor_parallel_degree * pipeline_parallel_degree)

if self.sharding_parallel_degree == -1:
if len(self.sharding) > 0:

self.sharding_parallel_degree = self.data_parallel_degree

sharding_parallel_degree = max(self.sharding_parallel_degree, 1)
if sharding_parallel_degree == 1 and len(self.sharding) > 0:
logger.warning("sharding_parallel_degree=1 means no sharding, please set sharding to empty!")

self.sharding = []

if ShardingOption.OFFLOAD in self.sharding:

warnings.warn("`offload` is not supported NOW!")

strategy = fleet.auto.Strategy()
if pipeline_parallel_degree > 1:
pipeline_parallel_config = set(self.pipeline_parallel_config.split(" "))
for x in pipeline_parallel_config:
if len(x) > 0:

if x not in [
# "disable_p2p_cache_shape", # no need for auto_parallel
# "disable_partial_send_recv", # no implemenation for auto_parallel
@@ -1086,108 +1090,108 @@
# "enable_sharding_comm_overlap", # no implemenation for auto_parallel
# "enable_timer", # no implemenation for auto_parallel
]:
raise ValueError(

f"Found unknown pipeline mode config {x}, accpet config is disable_p2p_cache_shape, disable_partial_send_recv."
)

pipeline = strategy.pipeline
pipeline.enable = True
pipeline.accumulate_steps = self.gradient_accumulation_steps
pipeline.micro_batch_size = self.per_device_train_batch_size
pipeline.schedule_mode = "1F1B"

if self.amp_master_grad:
warnings.warn("`amp_master_grad` is not supported NOW in AutoParallel!")
self.amp_master_grad = False
logger.info(f"PP configs:{strategy.pipeline}, use master_grad: {self.amp_master_grad}")

if self.do_eval:
assert (
self.per_device_train_batch_size * self.gradient_accumulation_steps

== self.per_device_eval_batch_size
), (
"In pipeline model, the evaluation also shares same setting with training. "
"Please set per_device_eval_batch_size=per_device_train_batch_size * gradient_accumulation_steps."
)

if tensor_parallel_degree > 1:
mp_optimization = strategy.mp_optimization

if " " in self.tensor_parallel_config:
mp_config = set(self.tensor_parallel_config.split(" "))

else:
mp_config = set(self.tensor_parallel_config.split(","))

for x in mp_config:
if len(x) > 0:
if x not in [

"enable_mp_async_allreduce", # allreduce_matmul_grad_overlapping in auto_parallel
# "enable_mp_skip_c_identity",

# "enable_mp_fused_linear_param_grad_add",
]:
raise ValueError(
f"Found unknown tensor parallell config {x}, "

f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add"
)
try:
if "enable_mp_async_allreduce" in mp_config:
mp_optimization.allreduce_matmul_grad_overlapping = True

except:
warnings.warn(
"The enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add are not supported "
"by current version of Paddle. Please try latest develop Paddle."
)

if sharding_parallel_degree > 1:
sharding = strategy.sharding

sharding.enable = True
sharding.degree = sharding_parallel_degree
if ShardingOption.SHARD_OP in self.sharding:
sharding.stage = 1
elif ShardingOption.SHARD_GRAD_OP in self.sharding:
sharding.stage = 2
elif ShardingOption.FULL_SHARD in self.sharding:
sharding.stage = 3

sharding_parallel_config = set(self.sharding_parallel_config.split(" "))
for x in sharding_parallel_config:
if len(x) > 0:
if x not in [
# "enable_stage1_tensor_fusion",

# "enable_stage1_overlap",
# "enable_stage2_overlap",
]:
raise ValueError(
f"Found unknown pipeline mode config {x}, " f"accpet config is reduce_overlap."

)

if (
"enable_stage1_overlap" in sharding_parallel_config
or "enable_stage2_overlap" in sharding_parallel_config

):
sharding.reduce_overlap = True

if self.bf16 or self.fp16:

amp = strategy.amp
amp.enable = True
amp.dtype = "bfloat16" if self.bf16 else "float16"
amp.level = self.fp16_opt_level

amp.init_loss_scaling = self.scale_loss
amp.custom_black_list = self.amp_custom_black_list
amp.custom_white_list = self.amp_custom_white_list

if self.recompute:
recompute = strategy.recompute
recompute.enable = True

self.strategy = strategy

logger.info(self.strategy)
order = ["dp", "pp", "mp"]
degree = [self.data_parallel_degree, pipeline_parallel_degree, tensor_parallel_degree]
mesh_dims = list(filter(lambda x: x[1] > 1, list(zip(order, degree))))
if not mesh_dims:
mesh_dims = [("dp", 1)]
fleet.auto.create_mesh(mesh_dims)

else:
world_size = paddle.distributed.get_world_size()
if world_size > 1:
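
To make the auto-parallel degree arithmetic above concrete, here is a standalone sketch with assumed values (8 ranks, tensor parallel degree 2, pipeline parallel degree 2). It reproduces only the pure-Python mesh-dimension computation, not the `fleet` calls.

```python
# Illustrative re-computation of the mesh dimensions derived above; all values are assumed.
world_size = 8
tensor_parallel_degree = 2
pipeline_parallel_degree = 2

# data_parallel_degree = world_size // (tp * pp) -> 8 // (2 * 2) = 2
data_parallel_degree = world_size // (tensor_parallel_degree * pipeline_parallel_degree)

order = ["dp", "pp", "mp"]
degree = [data_parallel_degree, pipeline_parallel_degree, tensor_parallel_degree]

# Keep only parallel dimensions with degree > 1, as done before fleet.auto.create_mesh.
mesh_dims = [dim for dim in zip(order, degree) if dim[1] > 1]
print(mesh_dims)  # [('dp', 2), ('pp', 2), ('mp', 2)]
```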