
Fix ernie ci auto trainer error #9758

Merged
[AutoParallel]: fix ernie auto_trainer error
blacksheep-Aristotle committed Jan 10, 2025
commit a2f571fd3878a8a7e5d6694ad78d70a9b3e30fe3
6 changes: 1 addition & 5 deletions paddlenlp/trainer/auto_trainer.py
@@ -20,7 +20,7 @@
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.auto_parallel.intermediate.parallelize as parallelize
import paddle.nn as nn
from paddle.distributed import fleet
from tqdm.auto import tqdm
@@ -30,7 +30,7 @@
from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler
from ..utils.log import logger
from .argparser import strtobool
from .auto_training_args import AutoTrainingArguments
from .trainer import SCALER_NAME, SCHEDULER_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME
from .trainer_callback import TrainerState
from .trainer_utils import ( # set_hyrbid_parallel_seed,
@@ -67,27 +67,23 @@
return loss

kwargs.update({"criterion": loss_func})
self.auto_dist_config = kwargs.pop("auto_dist_config", None)
model = kwargs.get("model", None)
assert model is not None
if (
kwargs.get("args", None) is not None
and kwargs["args"].use_intermediate_api
and not parallelize.has_parallelized_model
):
if kwargs.get("args", None) is not None and kwargs["args"].use_intermediate_api:
if not parallelize.has_parallelized_model:
model, self.auto_dist_config = self.parallel_model(model, kwargs["args"])
kwargs["model"] = model
else:
assert kwargs.get(
"auto_dist_config", None
), "when using AutoTrainer.parallel_model, the auto_dist_config obtained from parallel_model should be passed to AutoTrainer"
self.auto_dist_config = kwargs.pop("auto_dist_config")
model = kwargs["model"]
for param in model.parameters():
# NOTE(zhangwl): in pipeline mode, a param may have been initialized before and its init_func deleted, but the param is still not initialized
if not param._is_initialized() and param._init_func is not None:
param.initialize()
Review comment from @jeff41404 (Contributor), Jan 9, 2025:

If param._init_func is not None, should this use param._init_func() or model._init_weights(Layer)?

kwargs["model"] = model
super().__init__(*args, **kwargs)
assert self.args.enable_auto_parallel
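
Taken together, the block above allows two construction paths when args.use_intermediate_api is enabled. A rough sketch of both, assuming model and training_args already exist and omitting every other Trainer argument (datasets, tokenizer, etc.):

# Path 1: pass the un-parallelized model and let __init__ call parallel_model internally.
trainer = AutoTrainer(model=model, args=training_args)

# Path 2: parallelize up front via the classmethod, then pass the returned
# auto_dist_config back in; the assert in the else branch above enforces this.
model, auto_dist_config = AutoTrainer.parallel_model(model, training_args)
trainer = AutoTrainer(model=model, args=training_args, auto_dist_config=auto_dist_config)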
@@ -96,20 +92,20 @@
self.comm_group_in_pp = fleet.get_hybrid_communicate_group().get_pipe_parallel_group()
self._in_pir_mode = paddle.base.framework.get_flags("FLAGS_enable_pir_api")["FLAGS_enable_pir_api"]

@classmethod
def parallel_model(cls, model, training_args: AutoTrainingArguments):
if not training_args.use_intermediate_api:
return model, None
assert model is not None
for param in model.parameters():
if param._is_initialized():
logger.warning(
"intermediate_api needs lazy init: if a param is initialized before parallelize_model,"
+ " the param will be allocated its full amount of memory."
+ " We recommend allocating memory after parallelize_model to reduce the peak of memory allocation."
)

auto_dist_degree = {
"tensor_parallel": training_args.tensor_parallel_degree > 1,
"sequence_parallel": training_args.sequence_parallel,
"pipeline_parallel": training_args.pipeline_parallel_degree > 1,
@@ -117,12 +113,12 @@
"sharding": training_args.sharding,
"sharding_mesh_dim": training_args.sharding_parallel_mesh_dimension,
}
auto_dist_config = model._generate_auto_dist_config(auto_dist_degree)
model = parallelize.parallelize_model(
model,
config=auto_dist_config,
)
return model, auto_dist_config
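
The warning above assumes parameters are still lazily initialized when parallel_model runs. A minimal sketch of the intended calling order, assuming paddle.LazyGuard as the lazy-init mechanism and MyModel/config as placeholder names (the diff itself does not prescribe how the model is constructed):

with paddle.LazyGuard():
    model = MyModel(config)  # layers are built, but parameters are not allocated yet

model, auto_dist_config = AutoTrainer.parallel_model(model, training_args)
# Parameters are materialized afterwards (e.g. via param.initialize() in __init__ above),
# so the full, unsharded weights never have to be allocated up front.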


def _nested_gather(self, tensors):
"""
@@ -171,7 +167,7 @@

if self.args.use_intermediate_api:
assert self.auto_dist_config is not None
self.optimizer = parallelize.parallelize_optimizer(
self.optimizer,
config=self.auto_dist_config,
)
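
Outside the trainer, the same pairing applies when driving the intermediate API directly; a small sketch, assuming the model was parallelized as above and using paddle.optimizer.AdamW purely as an example:

optimizer = paddle.optimizer.AdamW(learning_rate=1e-4, parameters=model.parameters())
optimizer = parallelize.parallelize_optimizer(optimizer, config=auto_dist_config)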