When trying to run the Llama framework with FSDP turned on, this is the error I get:
Traceback (most recent call last):
  File "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 21, in <module>
    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
  File "/opt/NeMo/nemo/collections/nlp/__init__.py", line 15, in <module>
    from nemo.collections.nlp import data, losses, models, modules
  File "/opt/NeMo/nemo/collections/nlp/data/__init__.py", line 42, in <module>
    from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import (
  File "/opt/NeMo/nemo/collections/nlp/data/zero_shot_intent_recognition/__init__.py", line 16, in <module>
    from nemo.collections.nlp.data.zero_shot_intent_recognition.zero_shot_intent_dataset import (
  File "/opt/NeMo/nemo/collections/nlp/data/zero_shot_intent_recognition/zero_shot_intent_dataset.py", line 30, in <module>
    from nemo.collections.nlp.parts.utils_funcs import tensor2list
  File "/opt/NeMo/nemo/collections/nlp/parts/__init__.py", line 17, in <module>
    from nemo.collections.nlp.parts.utils_funcs import list2str, tensor2list
  File "/opt/NeMo/nemo/collections/nlp/parts/utils_funcs.py", line 37, in <module>
    from nemo.collections.nlp.modules.common.megatron.utils import erf_gelu
  File "/opt/NeMo/nemo/collections/nlp/modules/__init__.py", line 16, in <module>
    from nemo.collections.nlp.modules.common import (
  File "/opt/NeMo/nemo/collections/nlp/modules/common/__init__.py", line 36, in <module>
    from nemo.collections.nlp.modules.common.tokenizer_utils import get_tokenizer, get_tokenizer_list
  File "/opt/NeMo/nemo/collections/nlp/modules/common/tokenizer_utils.py", line 28, in <module>
    from nemo.collections.nlp.parts.nlp_overrides import HAVE_MEGATRON_CORE
  File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 37, in <module>
    from pytorch_lightning.plugins.precision import FSDPPrecision, MixedPrecisionPlugin
ImportError: cannot import name 'FSDPPrecision' from 'pytorch_lightning.plugins.precision' (/usr/local/lib/python3.10/dist-packages/pytorch_lightning/plugins/precision/__init__.py)
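This looks like a PyTorch Lightning version mismatch: the NeMo code in /opt/NeMo imports FSDPPrecision, which is not exported by the Lightning release installed in the container. As a quick diagnostic (a sketch; the exact class names available depend on the installed release), one can check what the container actually ships:

```python
# Diagnostic sketch: print the installed PyTorch Lightning version and which
# FSDP precision classes it exports. FSDPPrecision is the newer (2.1+) name;
# older 2.0.x releases export FSDPMixedPrecisionPlugin instead (an assumption
# to verify against the container).
import pytorch_lightning as pl
import pytorch_lightning.plugins.precision as prec

print("pytorch_lightning version:", pl.__version__)
print("FSDP precision classes:", [n for n in dir(prec) if "FSDP" in n])
```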
I ran using version 24.01. The command I used is this:
export HYDRA_FULL_ERROR=1; venv/bin/python3 launcher_scripts/main.py \
    training=llama/llama2_70b \
    stages=["training"] \
    numa_mapping.enable=True \
    container=/mtrsysgwork/liorpa/containers/nemo_24_01.sqsh \
    training.trainer.num_nodes=16 \
    training.model.global_batch_size=256 \
    training.model.virtual_pipeline_model_parallel_size=null \
    training.model.tensor_model_parallel_size=4 \
    training.model.pipeline_model_parallel_size=1 \
    training.model.micro_batch_size=2 \
    training.model.data.data_prefix=["1.0",'${data_dir}/my-gpt3_00_text_document'] \
    data_dir=/raid/dataset/the_pile/shard00 \
    launcher_scripts_path=$PWD/launcher_scripts \
    base_results_dir=/mtrsysgwork/liorpa/llama_results \
    training.model.data.index_mapping_dir=/mtrsysgwork/liorpa/llama_results \
    training.trainer.max_steps=100 \
    training.trainer.val_check_interval=100 \
    training.exp_manager.create_checkpoint_callback=false \
    training.trainer.enable_checkpointing=False \
    training.trainer.log_every_n_steps=1 \
    training.run.name=llama2_70b_release_24_01 \
    cluster.partition=ISR1-ALL \
    +training.model.fsdp=true \
    training.model.megatron_amp_O2=false \
    training.model.activations_checkpoint_num_layers=null \
    training.model.optim.name=fused_adam \
    cluster.gpus_per_node=8 \
    training.model.tokenizer.model=/mtrsysgwork/liorpa/llama_results/llama/tokenizer.model \
    training.model.gradient_accumulation_fusion=false \
    training.exp_manager.create_wandb_logger=false \
    ~training.model.optim.bucket_cap_mb \
    ~training.model.optim.overlap_grad_sync \
    ~training.model.optim.overlap_param_sync \
    ~training.model.optim.contiguous_grad_buffer
Perhaps it could be fixed using this:
NVIDIA/NeMo#8689
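In the meantime, a possible local workaround in nlp_overrides.py would be to guard the import. This is a minimal sketch, not the actual fix from that PR, and it assumes the installed Lightning 2.0.x provides the plugin under its older name FSDPMixedPrecisionPlugin:

```python
# Compatibility shim sketch (hypothetical; not the fix in NVIDIA/NeMo#8689):
# fall back to the pre-2.1 class name when FSDPPrecision is missing from the
# installed PyTorch Lightning.
try:
    # PyTorch Lightning >= 2.1
    from pytorch_lightning.plugins.precision import FSDPPrecision
except ImportError:
    # PyTorch Lightning 2.0.x: same plugin under its older name
    from pytorch_lightning.plugins.precision import (
        FSDPMixedPrecisionPlugin as FSDPPrecision,
    )
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin
```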