Add llama 3.1 recipes #11273

Merged: 5 commits, Nov 13, 2024
7 changes: 6 additions & 1 deletion nemo/collections/llm/gpt/model/llama.py
@@ -273,7 +273,12 @@ def make_vocab_size_divisible_by(vocab_size):
base //= 2
return base

-output = LlamaConfig(
+if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3':
+    # Apply Llama 3.1's customized RoPE scaling
+    cls = Llama31Config
+else:
+    cls = LlamaConfig
+output = cls(
num_layers=source.num_hidden_layers,
hidden_size=source.hidden_size,
ffn_hidden_size=source.intermediate_size,
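The dispatch above keys on the Hugging Face config's rope_scaling field: only Llama 3.1 checkpoints carry rope_scaling={'rope_type': 'llama3'}, so those get Llama31Config while everything else falls back to LlamaConfig. A minimal, self-contained sketch of the same check (the config objects are mocked with SimpleNamespace; the returned strings are stand-ins for the real NeMo classes):

from types import SimpleNamespace

def pick_config_cls(source):
    # Same test as the diff above: Llama 3.1 HF configs declare
    # rope_scaling={'rope_type': 'llama3'}; older Llama configs do not.
    if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3':
        return 'Llama31Config'
    return 'LlamaConfig'

assert pick_config_cls(SimpleNamespace(rope_scaling={'rope_type': 'llama3'})) == 'Llama31Config'
assert pick_config_cls(SimpleNamespace(rope_scaling=None)) == 'LlamaConfig'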
4 changes: 4 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -30,6 +30,8 @@
llama3_70b,
llama3_70b_16k,
llama3_70b_64k,
llama31_8b,
llama31_70b,
llama31_405b,
mamba2_1_3b,
mamba2_2_7b,
@@ -82,6 +84,8 @@
"llama3_70b",
"llama3_70b_16k",
"llama3_70b_64k",
"llama31_8b",
"llama31_70b",
"llama31_405b",
"mamba2_130m",
"mamba2_370m",
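With these exports in place, the Llama 3.1 recipes are addressable by name alongside the existing Llama 3 entries. A small sketch of looking them up dynamically (assuming a NeMo build that includes this PR; each listed name is a recipe module):

from nemo.collections.llm import recipes

# Each Llama recipe module exposes model() and recipe builders.
for name in ("llama31_8b", "llama31_70b", "llama31_405b"):
    module = getattr(recipes, name)
    print(name, hasattr(module, "finetune_recipe"))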
19 changes: 17 additions & 2 deletions nemo/collections/llm/recipes/finetune_default.py
@@ -16,6 +16,7 @@

import nemo_run as run
import pytorch_lightning as pl
import torch

import nemo.lightning as nl
from nemo.collections import llm
@@ -82,7 +83,7 @@ def default_finetune_trainer(
def default_finetune_trainer(
tensor_parallelism=1,
pipeline_parallelism=1,
-pipeline_parallelism_type=None,
+pipeline_parallelism_type=torch.bfloat16,
virtual_pipeline_parallelism=None,
context_parallelism=1,
sequence_parallelism=False,
@@ -93,6 +94,19 @@ def default_finetune_trainer(
limit_val_batches=None,
val_check_interval=30,
):
"""
Create a default fine-tuning trainer for any model.

This function sets up a template for strategy and trainer.

Args:
See docstrings of MegatronStrategy and Trainer.

Returns:
run.Config: Config for a finetuning trainer.

See usages of this function in recipes for further details.
"""
strategy = run.Config(
nl.MegatronStrategy,
tensor_model_parallel_size=tensor_parallelism,
@@ -125,7 +139,8 @@ def default_finetune_trainer(

def nemo_resume(model_id: str) -> run.Config[nl.AutoResume]:
"""
-Configure automatic resumption from a NeMo checkpoint converted from Huggingface for https://huggingface.co/{model_id}.
+Configure automatic resumption from a NeMo checkpoint converted from Huggingface for
+https://huggingface.co/{model_id}.

This NeMo checkpoint should be converted from Huggingface beforehand, using nemo.collections.llm.import_ckpt.
When converting the checkpoint, the NeMo checkpoint will be saved in NEMO_HOME (set to ~/.cache/nemo by default).
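As the nemo_resume docstring says, the Hugging Face weights must be converted to a NeMo checkpoint first. A hedged sketch of that one-time conversion step (assuming NEMO_HOME defaults to ~/.cache/nemo; the exact import_ckpt invocation may vary across NeMo versions):

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel

# Converts hf://meta-llama/Llama-3.1-405B into a NeMo checkpoint under
# NEMO_HOME, where nemo_resume("meta-llama/Llama-3.1-405B") will find it.
llm.import_ckpt(
    model=LlamaModel(Llama31Config405B()),
    source="hf://meta-llama/Llama-3.1-405B",
)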
118 changes: 110 additions & 8 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -24,6 +24,7 @@
from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
@@ -33,6 +34,7 @@
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192,
)
from nemo.lightning.pytorch.callbacks import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

@@ -248,6 +250,9 @@ def finetune_recipe(
num_nodes: int = 3,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
seq_length: Optional[int] = None,
packed_sequence: Optional[bool] = None,
performance_mode: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Llama3.1 405B model.
@@ -261,8 +266,11 @@
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
-peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
+peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None.
+seq_length (Optional[int]): Maximum number of tokens per microbatch.
+packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
+    maximum seq_length for better efficiency. By default, this value equals performance_mode.
+performance_mode (bool): If true, enables optimizations for maximum performance.
Returns:
run.Partial: Partial configuration for fine-tuning.

Expand All @@ -279,22 +287,116 @@ def finetune_recipe(
This recipe uses the SQuAD dataset for fine-tuning. Be aware that fine-tuning a 405B model
requires substantial computational resources.
"""
if packed_sequence is None:
packed_sequence = performance_mode

if seq_length is None:
seq_length = 2048

if num_nodes is None:
if peft_scheme is None or peft_scheme.lower() == 'none':
num_nodes = 12
elif peft_scheme.lower() == 'lora':
num_nodes = 3

recipe = default_finetune_recipe(
model(), "meta-llama/Meta-Llama-3.1-405B", dir, name, num_nodes, num_gpus_per_node
model(), "meta-llama/Llama-3.1-405B", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)

if peft_scheme is None or peft_scheme.lower() == 'none':
assert num_nodes >= 4
recipe.trainer.strategy.tensor_model_parallel_size = 8
-recipe.trainer.strategy.pipeline_model_parallel_size = 4
+recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.data.global_batch_size = 6
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
recipe.peft.dim = 16
recipe.peft.alpha = 32
recipe.peft.target_modules = ['linear_qkv']
recipe.optim.config.use_distributed_optimizer = False

# some settings currently do not function correctly with LoRA
recipe.model.config.cross_entropy_loss_fusion = False
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
-recipe.trainer.strategy.virtual_pipeline_parallelism = 7
-recipe.data.global_batch_size = 128
+recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7
+recipe.data.global_batch_size = 6
recipe.optim.config.lr = 1e-4
else:
raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

# Sequence length settings in the model and dataset must agree
recipe.model.config.seq_length = seq_length
recipe.data.seq_length = seq_length
if packed_sequence:
recipe.data.dataset_kwargs = {'pad_to_max_length': True}
recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length)

if performance_mode:
recipe = finetune_performance_optimizations(recipe, peft_scheme)

return recipe
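Taken together, the new arguments give the recipe a plain and a performance-tuned variant. A usage sketch (construction only; attaching an executor and launching with nemo_run is left out):

from nemo.collections.llm.recipes import llama31_405b

# LoRA fine-tuning with performance optimizations; packed_sequence defaults
# to True because performance_mode is True.
recipe = llama31_405b.finetune_recipe(
    name="llama31_405b_finetune",
    num_nodes=3,
    num_gpus_per_node=8,
    peft_scheme="lora",
    seq_length=2048,
    performance_mode=True,
)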


def finetune_performance_optimizations(
recipe: run.Partial,
peft_scheme: str,
) -> run.Partial:
"""
Modify the given recipe to optimize settings for performance.

This method enables performance optimizations that may not be suitable for all use cases.
Intended to build upon the standard fine-tuning recipe.

Args:
recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.

Returns:
run.Partial: Partial configuration for performance-optimized fine-tuning.

Note:
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""

if not hasattr(recipe.trainer, "callbacks"):
recipe.trainer.callbacks = []

if peft_scheme is None or peft_scheme.lower() == 'none':
# Note: limited support. This is not necessarily the most optimized setting
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.trainer.plugins.grad_reduce_in_fp32 = False
recipe.trainer.strategy.ddp = run.Config(
DistributedDataParallelConfig,
check_for_nan_in_grad=True,
grad_reduce_in_fp32=False,
overlap_grad_reduce=True,
overlap_param_gather=True,
average_in_collective=True,
)
recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
tp_comm_overlap=True,
defer_embedding_wgrad_compute=True,
wgrad_deferral_limit=22,
)
)
else:
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7

recipe.trainer.strategy.sequence_parallel = True

recipe.trainer.callbacks.append(run.Config(TimingCallback))
recipe.trainer.callbacks.append(
run.Config(
GarbageCollectionCallback,
100,  # presumably gc_interval_train: steps between explicit GC runs during training
100,  # presumably gc_interval_val: steps between explicit GC runs during validation
)
)

return recipe
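The parallelism settings above pin down the minimum cluster size: each data-parallel replica needs tensor-parallel times pipeline-parallel GPUs. A quick arithmetic check with the values from both branches:

def min_nodes(tp, pp, gpus_per_node=8):
    # Virtual pipeline stages reuse the same GPUs, so only TP x PP counts.
    return (tp * pp) // gpus_per_node

assert min_nodes(4, 6) == 3    # LoRA branch: 24 GPUs, i.e. the default num_nodes=3
assert min_nodes(8, 14) == 14  # full fine-tuning branch: 112 GPUs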