Revert PR 32299, flag users when Zero-3 was missed (#32851)
Revert PR 32299
muellerzr authored and ArthurZucker committed Aug 16, 2024
1 parent 025485b commit 7e85948
Showing 3 changed files with 28 additions and 40 deletions.
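
The reverted mechanism, per the diff below: `_from_config()` and `from_pretrained()` tagged models with a `_transformers_zero3_init_used` attribute, and `Trainer.__init__` raised a `ValueError` when DeepSpeed Zero-3 was enabled but that flag was `False`. A minimal sketch of the usage pattern the check was meant to flag (the checkpoint name and config path are illustrative, not taken from this commit):

```python
from transformers import AutoModel, Trainer, TrainingArguments

# Model built *before* the Zero-3-enabled TrainingArguments: DeepSpeed's zero.Init()
# is not active yet, so weights are not partitioned and the flag ends up False.
model = AutoModel.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./out",
    deepspeed="ds_config_zero3.json",  # placeholder path to a Zero-3 config
)

# With PR 32299 in place this raised a ValueError; after this revert it does not.
trainer = Trainer(model=model, args=training_args)
```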
6 changes: 0 additions & 6 deletions src/transformers/modeling_utils.py
@@ -1477,9 +1477,6 @@ def _from_config(cls, config, **kwargs):
else:
model = cls(config, **kwargs)

# Flag for if we init with `zero3`, add an attr to the model so we can check downstream for issues
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled()

# restore default dtype if it was modified
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
@@ -3809,9 +3806,6 @@ def from_pretrained(
# Let's make sure we don't run the init function of buffer modules
model = cls(config, *model_args, **model_kwargs)

# If we init with `zero3`, add an attr to the model so we can check downstream for issues
model._transformers_zero3_init_used = is_deepspeed_zero3_enabled() and not is_quantized

# make sure we use the model's config since the __init__ call might have copied it
config = model.config

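As the removed comment above notes, the attribute exists so downstream code can detect a missed Zero-3 init. A sketch of that consumer pattern, mirroring the `Trainer` check removed in the next file (the helper name is hypothetical):

```python
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

def check_zero3_init(model):
    # Models built directly (e.g. MyModel(config)) never get the attribute,
    # so the default of True keeps them out of the check.
    if is_deepspeed_zero3_enabled() and not getattr(model, "_transformers_zero3_init_used", True):
        raise ValueError(
            "Model was not initialized under DeepSpeed Zero-3; re-create it via "
            "from_pretrained()/from_config() after creating your TrainingArguments."
        )
```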
10 changes: 0 additions & 10 deletions src/transformers/trainer.py
@@ -100,7 +100,6 @@
get_model_param_count,
get_module_class_from_name,
get_parameter_names,
is_deepspeed_zero3_enabled,
nested_concat,
nested_detach,
nested_numpify,
@@ -436,15 +435,6 @@ def __init__(
)
self.model_init = model_init

# Will reach this branch if the user has
# 1. Used `.from_pretrained` or `.from_config` to initialize their model
# 2. Did not configure Zero-3 via `TrainingArguments` or `accelerate launch` beforehand
# New models init such as `MyModel()` will not hit this step
if is_deepspeed_zero3_enabled() and not getattr(model, "_transformers_zero3_init_used", True):
raise ValueError(
"Model was not initialized with `Zero-3` despite being configured for DeepSpeed Zero-3. Please re-initialize your model via `Model.from_pretrained(...)` or `Model.from_config(...)` after creating your `TrainingArguments`!"
)

if model.__class__.__name__ in MODEL_MAPPING_NAMES:
raise ValueError(
f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only "
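The block removed above pointed users toward a specific initialization order. A minimal sketch of that order, assuming a Zero-3 JSON config at a placeholder path:

```python
from transformers import AutoModel, Trainer, TrainingArguments

# 1. Create the Zero-3-enabled TrainingArguments first so DeepSpeed Zero-3 is registered.
training_args = TrainingArguments(
    output_dir="./out",
    deepspeed="ds_config_zero3.json",  # placeholder path to a Zero-3 config
)

# 2. from_pretrained() now runs under zero.Init() and partitions parameters at load time.
model = AutoModel.from_pretrained("t5-small")

# 3. After this revert the Trainer no longer enforces the order, but this remains
#    the order that avoids the missed-Zero-3-init problem.
trainer = Trainer(model=model, args=training_args)
```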
52 changes: 28 additions & 24 deletions tests/deepspeed/test_deepspeed.py
@@ -709,30 +709,34 @@ def test_gradient_accumulation(self, stage, dtype):
# Relative difference. See the note above how to get identical loss on a small bs
self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)

def test_missed_zero3_init(self):
from transformers import Trainer # noqa

with mockenv_context(**self.dist_env_1_gpu):
model = AutoModel.from_pretrained(T5_TINY)
training_args = TrainingArguments(
output_dir="./test_missed_zero3_init",
deepspeed=self.get_config_dict(ZERO3),
)
with self.assertRaises(
ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
):
_ = Trainer(
model=model,
args=training_args,
)
# Now do it properly, triggered from our `TrainingArguments` earlier
model = AutoModel.from_pretrained(T5_TINY)
trainer = Trainer(
model=model,
args=training_args,
)
assert trainer.is_deepspeed_enabled
assert model._transformers_zero3_init_used
# NOTE: Currently a disabled test. In the future we should re-enable it.
# Issue revolves around Zero-3 w/ DPO/TRL + DeepSpeed
# As well as Zero-3 inference
# Related PR: https://github.com/huggingface/transformers/pull/32299
# def test_missed_zero3_init(self):
# from transformers import Trainer # noqa

# with mockenv_context(**self.dist_env_1_gpu):
# model = AutoModel.from_pretrained(T5_TINY)
# training_args = TrainingArguments(
# output_dir="./test_missed_zero3_init",
# deepspeed=self.get_config_dict(ZERO3),
# )
# with self.assertRaises(
# ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
# ):
# _ = Trainer(
# model=model,
# args=training_args,
# )
# # Now do it properly, triggered from our `TrainingArguments` earlier
# model = AutoModel.from_pretrained(T5_TINY)
# trainer = Trainer(
# model=model,
# args=training_args,
# )
# assert trainer.is_deepspeed_enabled
# assert model._transformers_zero3_init_used

def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
# adapted from TrainerIntegrationCommon.check_saved_checkpoints
