diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py
index 535d0c9c21a2..6b158a33b226 100644
--- a/nemo/collections/llm/gpt/model/base.py
+++ b/nemo/collections/llm/gpt/model/base.py
@@ -317,7 +317,6 @@ class GPTConfig175B(GPTConfig):
     num_attention_heads: int = 96
     hidden_dropout: float = 0.0
     attention_dropout: float = 0.0
-    ffn_dropout: float = 0.0
     bias_activation_fusion: bool = True
     bias_dropout_add_fusion: bool = True
     use_transformer_engine_full_layer_spec: bool = True
diff --git a/nemo/collections/llm/recipes/llama3_70b.py b/nemo/collections/llm/recipes/llama3_70b.py
index e393dea908b7..e2156993647d 100644
--- a/nemo/collections/llm/recipes/llama3_70b.py
+++ b/nemo/collections/llm/recipes/llama3_70b.py
@@ -244,7 +244,7 @@ def pretrain_performance_optimizations(recipe: run.Partial) -> run.Partial:
 def finetune_recipe(
     dir: Optional[str] = None,
     name: str = "default",
-    num_nodes: int = 1,
+    num_nodes: int = None,
     num_gpus_per_node: int = 8,
     peft_scheme: Optional[str] = 'lora',
     seq_length: Optional[int] = None,
@@ -293,11 +293,16 @@ def finetune_recipe(
     if seq_length is None:
         seq_length = 4096 if packed_sequence else 2048
 
+    if num_nodes is None:
+        if peft_scheme is None or peft_scheme.lower() == 'none':
+            num_nodes = 4
+        elif peft_scheme.lower() == 'lora':
+            num_nodes = 1
+
     recipe = default_finetune_recipe(
         model(), "meta-llama/Meta-Llama-3-70B", dir, name, num_nodes, num_gpus_per_node, packed_sequence
     )
     if peft_scheme is None or peft_scheme.lower() == 'none':
-        assert num_nodes >= 4
         recipe.trainer.strategy.tensor_model_parallel_size = 8
         recipe.trainer.strategy.pipeline_model_parallel_size = 4
         recipe.optim.config.lr = 5e-6
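
Usage sketch of the new `num_nodes` defaulting behaviour, assuming a NeMo build that includes this change; only the `finetune_recipe` signature and logic shown in the diff are relied on, and the concrete calls below are illustrative:

```python
from nemo.collections.llm.recipes import llama3_70b

# LoRA fine-tuning: leaving num_nodes unset now resolves to 1 node.
lora_recipe = llama3_70b.finetune_recipe(peft_scheme="lora")

# Full fine-tuning (peft_scheme=None): num_nodes now resolves to 4,
# matching the TP=8 / PP=4 parallelism the recipe configures (32 GPUs total).
full_recipe = llama3_70b.finetune_recipe(peft_scheme=None)

# An explicitly passed num_nodes still takes precedence over the derived default.
custom_recipe = llama3_70b.finetune_recipe(peft_scheme=None, num_nodes=8)
```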