diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py
index b6cb84481fc7..e6abee295a1a 100644
--- a/examples/nlp/language_modeling/megatron_bert_pretraining.py
+++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import torch.multiprocessing as mp
 from omegaconf.omegaconf import OmegaConf, open_dict
 from pytorch_lightning import Trainer
 from pytorch_lightning.plugins.environments import TorchElasticEnvironment
@@ -28,6 +29,8 @@
 from nemo.utils import logging
 from nemo.utils.exp_manager import exp_manager
 
+mp.set_start_method("spawn", force=True)
+
 
 @hydra_runner(config_path="conf", config_name="megatron_bert_config")
 def main(cfg) -> None:
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
index 12d121ca2bcd..90053f3052c8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py
@@ -643,7 +643,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples):
 
         # Torch dataloader.
         return torch.utils.data.DataLoader(
-            dataset, batch_sampler=batch_sampler, num_workers=self.cfg.data.num_workers, pin_memory=True,
+            dataset,
+            batch_sampler=batch_sampler,
+            num_workers=self.cfg.data.num_workers,
+            pin_memory=True,
+            persistent_workers=True if self.cfg.data.num_workers > 0 else False,
         )
 
     def setup_training_data(self, cfg):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 52a640a9efd1..d4132c4c7e80 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -795,7 +795,7 @@ def build_pretraining_data_loader(
             batch_sampler=batch_sampler,
             num_workers=self.cfg.data.num_workers,
             pin_memory=True,
-            persistent_workers=True,
+            persistent_workers=True if self.cfg.data.num_workers > 0 else False,
         )
 
     def setup(self, stage=None):
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
index acf96688b33a..331136c64a46 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py
@@ -604,7 +604,9 @@ def build_virtual_prompt_dataset(
             drop_last=drop_last,
             num_workers=num_workers,
             pin_memory=pin_memory,
-            persistent_workers=True,  # (@adithyare and @eharper) We need this to make spawn=True to work.
+            persistent_workers=True
+            if num_workers > 0
+            else False,  # (@adithyare and @eharper) We need this to make spawn=True to work.
         )
 
         return dataset, dataloader
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
index bfcc2c43631d..b3c08dff7ae8 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_t5_prompt_learning_model.py
@@ -440,7 +440,9 @@ def build_virtual_prompt_dataset(
             drop_last=drop_last,
             num_workers=num_workers,
             pin_memory=pin_memory,
-            persistent_workers=True,  # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True
+            persistent_workers=True
+            if num_workers > 0
+            else False,  # (@adithyare and @eharper) We need to set this to True to get around issues with spawn=True
         )
         print('build success', len(dataloader), dataset_paths)
         return dataset, dataloader
diff --git a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
index fbf4b029fcdc..efa059419eda 100644
--- a/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
+++ b/nemo/collections/nlp/models/machine_translation/megatron_nmt_model.py
@@ -430,7 +430,7 @@ def _setup_eval_dataloader_from_config(self, cfg: DictConfig, dataset):
                 pin_memory=cfg.get("pin_memory", False),
                 drop_last=cfg.get("drop_last", False),
                 shuffle=False,
-                persistent_workers=True,
+                persistent_workers=True if cfg.get("num_workers", 0) > 0 else False,
             )
         )
 
@@ -592,7 +592,7 @@ def _setup_megatron_dataloader_from_config(self, cfg, dataset, consumed_samples)
             collate_fn=collate_fn,
             num_workers=cfg.num_workers,
             pin_memory=cfg.pin_memory,
-            persistent_workers=True,
+            persistent_workers=True if cfg.num_workers > 0 else False,
        )
 
     def process_global_batch_for_text_translation_datasets(self, batch):
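
For context on the `persistent_workers` guard, here is a minimal standalone sketch (not NeMo code; the dataset and helper below are illustrative only). PyTorch's `DataLoader` raises `ValueError: persistent_workers option needs num_workers > 0` when `persistent_workers=True` is passed with `num_workers=0`, which is exactly what the conditionals above avoid, while still keeping workers persistent when the `spawn` start method is in effect.

```python
# Minimal sketch (standalone, not NeMo code) of the pattern the patch applies.
import torch
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, TensorDataset


def build_loader(num_workers: int) -> DataLoader:
    dataset = TensorDataset(torch.arange(8, dtype=torch.float32))
    return DataLoader(
        dataset,
        batch_size=2,
        num_workers=num_workers,
        pin_memory=True,
        # Gate the flag on the worker count: DataLoader raises
        # "ValueError: persistent_workers option needs num_workers > 0"
        # if persistent_workers=True is combined with num_workers=0.
        persistent_workers=num_workers > 0,
    )


if __name__ == "__main__":
    # Mirrors the change in megatron_bert_pretraining.py: with the spawn
    # start method, persistent workers avoid re-spawning (and re-importing
    # the training module in) worker processes on every epoch.
    mp.set_start_method("spawn", force=True)
    for batch in build_loader(num_workers=2):
        print(batch[0].shape)
```

Note that `num_workers > 0` already evaluates to a `bool`, so it is equivalent to the `True if num_workers > 0 else False` spelling in the diff; the multi-line form there looks like an auto-formatter artifact rather than a semantic choice.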