Skip to content

Commit

Permalink
Fix restore sequence parallel (NVIDIA#7273)
Browse files Browse the repository at this point in the history
* Fix restore

Signed-off-by: Cheng-Ping Hsieh <chsieh@nvidia.com>

* reset and restore transformer config sequence parallel

Signed-off-by: Jason Wang <jasonwan@nvidia.com>

* modify model parallel config as well

Signed-off-by: Jason Wang <jasonwan@nvidia.com>

---------

Signed-off-by: Cheng-Ping Hsieh <chsieh@nvidia.com>
Signed-off-by: Jason Wang <jasonwan@nvidia.com>
Co-authored-by: Jason Wang <jasonwan@nvidia.com>
  • Loading branch information
hsiehjackson and blahBlahhhJ authored Aug 21, 2023
1 parent aab3c4b commit 335b876
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1348,6 +1348,8 @@ def _reset_sequence_parallelism_args(self):

# Reset config values. Needed for calling generate.
self.cfg.sequence_parallel = False
self.model_parallel_config.sequence_parallel = False
self.transformer_config.sequence_parallel = False

# Reset model parameters.
for module in self.get_gpt_module_list():
@@ -1362,6 +1364,8 @@ def _restore_sequence_parallelism_args(self):
"""
# Restore config values.
self.cfg.sequence_parallel = self.last_sequence_parallel
self.model_parallel_config.sequence_parallel = self.last_sequence_parallel
self.transformer_config.sequence_parallel = self.last_sequence_parallel

# Restore model parameters.
for module in self.get_gpt_module_list():
Expand Down
Original file line number Diff line number Diff line change
@@ -586,6 +586,7 @@ def inference_epoch_end(self, outputs, mode, data_cfg):
# Merge the functionality of previous on_inference_epoch_end() within inference_epoch_end() func here
app_state = AppState()
self._restore_activation_checkpointing_args()
self._restore_sequence_parallelism_args()
if hasattr(self, "_train_ds"):
_reconfigure_microbatch_calculator(
rank=app_state.global_rank,
Expand Down

0 comments on commit 335b876

Please sign in to comment.