diff --git a/README.md b/README.md
index cdc991a506..aba8a5ad83 100644
--- a/README.md
+++ b/README.md
@@ -126,12 +126,12 @@ tune convert_checkpoint --checkpoint-path
 
 On a single GPU
 ```
-tune finetune_llm --config alpaca_llama2_finetune
+tune --nnodes 1 --nproc_per_node 1 full_finetune --config alpaca_llama2_full_finetune
 ```
 
 On multiple GPUs using FSDP
 ```
-tune --nnodes 1 --nproc_per_node 4 finetune_llm --config alpaca_llama2_finetune --fsdp True
+tune --nnodes 1 --nproc_per_node 4 full_finetune --config alpaca_llama2_full_finetune
 ```
 
 &nbsp;
@@ -140,9 +140,9 @@ tune --nnodes 1 --nproc_per_node 4 finetune_llm --config alpaca_llama2_finetune
 
 To copy a recipe to customize it yourself and then run
 ```
-tune recipe cp finetune_llm my_recipe/finetune_llm.py
-tune config cp alpaca_llama2_finetune my_recipe/alpaca_llama2_finetune.yaml
-tune my_recipe/finetune_llm.py --config my_recipe/alpaca_llama2_finetune.yaml
+tune recipe cp full_finetune my_recipe/full_finetune.py
+tune config cp alpaca_llama2_full_finetune my_recipe/alpaca_llama2_full_finetune.yaml
+tune my_recipe/full_finetune.py --config my_recipe/alpaca_llama2_full_finetune.yaml
 ```
 
 &nbsp;
@@ -154,15 +154,11 @@ recipes. Aside from torchtune recipe utilities, it integrates with ``torch.distributed.run``
 to support distributed job launching by default. ``tune`` offers everything that ``torchrun``
 does with the following additional functionalities:
 
-1. ``tune <recipe>`` with no optional ``torchrun`` options launches a single python process
+1. ``tune <recipe>`` will launch a torchrun job
 
 2. ``<recipe>`` and recipe arg ``<config>`` can both be passed in as names instead of paths if they're included in torchtune
 
-3. ``tune <path/to/recipe.py>`` can be used to launch local recipes
-
-4. ``tune <torchrun args>`` will launch a torchrun job
-
-5. ``tune recipe`` and ``tune config`` commands provide utilities for listing and copying packaged recipes and configs
+3. ``tune recipe`` and ``tune config`` commands provide utilities for listing and copying packaged recipes and configs
 
 &nbsp;
 
diff --git a/docs/source/recipes/finetune_llm.rst b/docs/source/recipes/finetune_llm.rst
index 5a7e4e3d91..51fedd1eba 100644
--- a/docs/source/recipes/finetune_llm.rst
+++ b/docs/source/recipes/finetune_llm.rst
@@ -16,25 +16,3 @@ This recipe supports:
 
 * :ref:`Distributed Training with FSDP`
 * :ref:`Activation Checkpointing`
-
-To run the recipe directly, launch with
-
-.. code-block:: bash
-
-    tune finetune_llm --config <config>
-
-Recipe
-------
-
-Copy the recipe directly into your own script or notebook to modify and edit for yourself.
-
-.. literalinclude:: ../../../recipes/finetune_llm.py
-
-Configs
--------
-
-.. tabs::
-
-    .. tab:: alpaca_llama2_finetune
-
-        .. literalinclude:: ../../../recipes/configs/alpaca_llama2_finetune.yaml
diff --git a/recipes/__init__.py b/recipes/__init__.py
index 035c57172b..c7c543f119 100644
--- a/recipes/__init__.py
+++ b/recipes/__init__.py
@@ -5,8 +5,11 @@
 # LICENSE file in the root directory of this source tree.
-_RECIPE_LIST = ["finetune_llm", "alpaca_generate"] -_CONFIG_LISTS = {"finetune_llm": ["alpaca_llama2_finetune"], "alpaca_generate": []} +_RECIPE_LIST = ["full_finetune", "alpaca_generate"] +_CONFIG_LISTS = { + "full_finetune": ["alpaca_llama2_full_finetune"], + "alpaca_generate": [], +} def list_recipes(): diff --git a/recipes/configs/alpaca_llama2_finetune.yaml b/recipes/configs/alpaca_llama2_finetune.yaml deleted file mode 100644 index db08e89660..0000000000 --- a/recipes/configs/alpaca_llama2_finetune.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Runs the finetune_llm.py recipe using FullFinetuneParams -# -# To launch, run the following command from root: -# tune finetune_llm --config alpaca_llama2_finetune --override model_checkpoint= ... - -# Dataset and Dataloader -dataset: alpaca -seed: null -shuffle: True - -# Model Arguments -model: llama2_7b -model_checkpoint: /tmp/llama2-7b -tokenizer: llama2_tokenizer -tokenizer_checkpoint: /tmp/tokenizer.model - -# Fine-tuning arguments -batch_size: 2 -lr: 2e-5 -epochs: 3 -optimizer: SGD -loss: CrossEntropyLoss -output_dir: /tmp/alpaca-llama2-finetune -device: cuda -dtype: fp32 -enable_activation_checkpointing: True -enable_fsdp: True -cpu_offload: False -resume_from_checkpoint: False - -# Metrics arguments -metric_logger_type: disk diff --git a/recipes/finetune_llm.py b/recipes/finetune_llm.py deleted file mode 100644 index 8e54748919..0000000000 --- a/recipes/finetune_llm.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import argparse -import os -from functools import partial - -import torch -from torch.cuda.amp import GradScaler -from torch.utils.data import DataLoader, DistributedSampler - -from torchtune import datasets, losses, models, modules, optim, utils -from torchtune.utils.checkpoint import load_checkpoint, save_checkpoint -from torchtune.utils.generation import generate_from_prompt -from tqdm import tqdm - -from recipes.params import FullFinetuneParams - - -def recipe( - params: FullFinetuneParams, -) -> None: - """Training loop for fine-tuning an LLM on a provided dataset. Supports evals, - checkpointing, and distributed training. - - Args: - params (FullFinetuneParams): dataclass containing all args for recipe. See ``FullFinetuneParams`` for - more details. - - Raises: - ValueError: If ``cpu_offload`` is ``True`` but ``device`` is not ``cuda`` and <= 1 GPUs. - """ - # ---- Initialize components ---- # - distributed = utils.init_distributed() - world_size, rank = utils.get_world_size_and_rank() - - logger = utils.get_logger("DEBUG") - metric_logger = utils.get_metric_logger( - metric_logger_type=params.metric_logger_type, - project=params.project, - log_dir=params.output_dir, - ) - - device = utils.get_device(params.device) - dtype = utils.get_dtype(params.dtype) - seed = utils.set_seed(params.seed) - - # ---- Setup model and load checkpoint ---- # - tokenizer = models.get_tokenizer(params.tokenizer, path=params.tokenizer_checkpoint) - logger.info(msg=f"Loaded tokenizer from {params.tokenizer_checkpoint}") - - # TODO: initialize models for distributed on meta or cpu device to avoid OOMs - model = models.get_model(params.model, device=device) - - if params.cpu_offload and not distributed: - raise ValueError( - "CPU offload is only supported with FSDP in a distributed setting." - "Please launch in a distributed setting. 
If you do not wish to use > 1 GPU," - "use ``tune --nnodes 1 --nproc_per_node 1 ...``. FSDP will not shard" - "any parameters." - ) - - if distributed: # Use FSDP model for distributed training - model = utils.wrap_fsdp( - model=model, - device=device, - dtype=dtype, - strategy="FULL_SHARD", - auto_wrap_policy={modules.TransformerDecoderLayer}, - cpu_offload=params.cpu_offload, - ) - if params.enable_activation_checkpointing: - utils.set_activation_checkpointing( - model, auto_wrap_policy={modules.TransformerDecoderLayer} - ) - - # ---- Setup optimization functions ---- # - opt = optim.get_optimizer(params.optimizer, model, params.lr) - # Load model and possibly optimizer states - if params.resume_from_checkpoint: - ckpt_dict = load_checkpoint(params.model_checkpoint, model, opt) - model.load_state_dict(ckpt_dict["model"]) - # Note: optimizer entry in dictionary is pre-transformed if using FSDP - opt.load_state_dict(ckpt_dict["optimizer"]) - if rank == 0: - logger.info( - msg=f"Loaded checkpoint from previous finetune from {params.model_checkpoint}" - ) - else: - ckpt_dict = load_checkpoint(params.model_checkpoint, model) - model.load_state_dict(ckpt_dict["model"]) - if rank == 0: - logger.info(msg=f"Loaded pretrained model from {params.model_checkpoint}") - - # TODO add lr schedule option - loss_fn = losses.get_loss(params.loss) - - autocast = utils.get_autocast(dtype, device) - if dtype == torch.float16: - grad_scaler = utils.get_gradient_scaler(distributed) - else: - grad_scaler = GradScaler(enabled=False) - - # ---- Load dataset, set up sampler, and dataloader ---- # - ds = datasets.get_dataset( - params.dataset, - split="train", - tokenizer=tokenizer, - train_on_input=params.train_on_input, - ) - sampler = DistributedSampler( - ds, - num_replicas=world_size, - rank=rank, - shuffle=params.shuffle, - seed=0, - ) - dataloader = DataLoader( - dataset=ds, - batch_size=params.batch_size, - sampler=sampler, - collate_fn=partial( - utils.padded_collate, - padding_idx=tokenizer.pad_id, - ignore_idx=loss_fn.ignore_index, # TODO support loss without ignore_index - ), - ) - logger.info(msg=f"Loaded dataset {params.dataset}") - - # ---- Train loop ---- # - for epoch in range(params.epochs): - sampler.set_epoch(epoch) # distributed sampler requires set_epoch - for idx, batch in enumerate(pbar := tqdm(dataloader, disable=not (rank == 0))): - if ( - params.max_steps_per_epoch is not None - and idx == params.max_steps_per_epoch - ): - break - opt.zero_grad() - - input_ids, labels = batch - input_ids = input_ids.to(device) - labels = labels.to(device) - - with autocast: - logits = model(input_ids) - # Shift so that tokens < n predict n - logits = logits[..., :-1, :].contiguous() - labels = labels[..., 1:].contiguous() - logits = logits.transpose(1, 2) - # Compute loss - loss = loss_fn(logits, labels) - - pbar.set_description(f"{epoch+1}|{idx+1}|Loss: {loss.item()}") - - # Log metrics at each step - # If no metric logger is specified, this is a no-op - if rank == 0: - metric_logger.log_dict( - { - "loss": loss.item(), - "lr": opt.param_groups[0]["lr"], - "gpu_resources": torch.cuda.memory_allocated(), - }, - step=epoch * len(dataloader) - + idx, # Each step is unique, not limited to each epoch - ) - - grad_scaler.scale(loss).backward() - grad_scaler.step(opt) - grad_scaler.update() - - # --- TODO TEMPORARY EVAL Code ---- # - if params.run_generation and idx % params.run_generation == 0: - # Log a sample generation for the instruction. 
- # Just using a hardcoded prompt for now - prompt = ( - "Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n### Instruction:\nCreate a classification task " - "by clustering the given list of items.\n\n### Input:\nApples, oranges, bananas, strawberries, pineapples\n\n" - "### Response:" - ) - generation_str, decoded_tokens = generate_from_prompt( - prompt=prompt, tokenizer=tokenizer, decoder=model - ) - if rank == 0: - logger.info(f"Generation tokens: {decoded_tokens}") - logger.info(f"Generation: {generation_str}") - # --- TODO TEMPORARY EVAL Code Ends ---- # - - # ---- Save checkpoint at end of each epoch (to be changed later) ---- # - os.makedirs(params.output_dir, exist_ok=True) - output_loc = f"{params.output_dir}/model_{epoch}.ckpt" - ckpt_dict = { - "model": model, - "optimizer": opt, - } - if epoch == params.epochs - 1: - # Don't save optimizer state when producing final checkpoint to reduce checkpoint file size. - ckpt_dict.pop("optimizer") - if rank == 0: - logger.info(msg=f"Saving model checkpoint to {output_loc}") - save_checkpoint(ckpt_dict, output_loc) - if rank == 0: - logger.info( - msg=f"Model checkpoint of size {os.path.getsize(output_loc) >> 20} MB saved to {output_loc}" - ) - - metric_logger.close() - - -if __name__ == "__main__": - parser = utils.TuneArgumentParser( - description=FullFinetuneParams.__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter, - ) - # Get user-specified args from config and CLI and create params for recipe - args, _ = parser.parse_known_args() - args = vars(args) - params = FullFinetuneParams(**args) - - logger = utils.get_logger("DEBUG") - logger.info(msg=f"Running finetune_llm.py with parameters {params}") - - recipe(params) diff --git a/recipes/tests/test_finetune_llm.py b/recipes/tests/test_full_finetune.py similarity index 68% rename from recipes/tests/test_finetune_llm.py rename to recipes/tests/test_full_finetune.py index 2b6dde071b..5b4c2848b2 100644 --- a/recipes/tests/test_finetune_llm.py +++ b/recipes/tests/test_full_finetune.py @@ -11,7 +11,6 @@ import pytest -import recipes.finetune_llm as finetune_llm from recipes.full_finetune import FullFinetuneRecipe from recipes.params import FullFinetuneParams @@ -40,124 +39,6 @@ def small_test_ckpt(max_batch_size: Optional[int] = None) -> TransformerDecoder: logger = logging.getLogger(__name__) -class TestFinetuneLLMRecipe: - def _fetch_loss_values(self, output) -> Dict[str, float]: - lines = output.splitlines() - loss_values = {} - for line in lines: - if "Loss:" in line: - splits = line.split("Loss:") - loss_value = float(splits[1].split(":")[0]) - loss_values[splits[0]] = loss_value - return loss_values - - def _fetch_expected_loss_values(self, ckpt) -> Dict[str, float]: - small_test_ckpt_loss_values = { - "1|1|": 10.5074, - "1|2|": 10.5563, - "2|1|": 10.5152, - "2|2|": 10.4851, - } - llama2_7b_ckpt_loss_values = { - "1|1|": 1.1333, - "1|2|": 1.1199, - "2|1|": 1.2614, - "2|2|": 0.9486, - } - if ckpt == "small_test_ckpt": - return small_test_ckpt_loss_values - if ckpt == "llama2_7b": - return llama2_7b_ckpt_loss_values - raise ValueError(f"Unknown ckpt {ckpt}") - - def _fetch_ckpt_model_path(self, ckpt) -> str: - if ckpt == "small_test_ckpt": - return "/tmp/test-artifacts/small-ckpt-01242024" - if ckpt == "llama2_7b": - return "/tmp/test-artifacts/llama2-7b-01242024" - raise ValueError(f"Unknown ckpt {ckpt}") - - def test_finetune_llm_loss(self, capsys, 
pytestconfig): - large_scale = pytestconfig.getoption("--large-scale") - ckpt = "llama2_7b" if large_scale else "small_test_ckpt" - expected_loss_values = self._fetch_expected_loss_values(ckpt) - - kwargs_values = { - "dataset": "alpaca", - "train_on_input": False, - "seed": 9, - "shuffle": True, - "model": ckpt, - "model_checkpoint": self._fetch_ckpt_model_path(ckpt), - "tokenizer": "llama2_tokenizer", - "tokenizer_checkpoint": "/tmp/test-artifacts/tokenizer.model", - "batch_size": 8, - "lr": 2e-5, - "epochs": 2, - "max_steps_per_epoch": 2, - "optimizer": "AdamW", - "loss": "CrossEntropyLoss", - "output_dir": "/tmp", - "device": "cpu", - "dtype": "fp32", - "enable_activation_checkpointing": False, - "enable_fsdp": False, - "run_generation": None, - "metric_logger_type": "disk", - "project": None, - "resume_from_checkpoint": False, - "cpu_offload": False, - } - - finetune_llm.recipe(FullFinetuneParams(**kwargs_values)) - loss_values = self._fetch_loss_values(capsys.readouterr().err) - logger.info("Expected loss values : ", expected_loss_values) - logger.info("Loss values from Finetune : ", loss_values) - assert len(loss_values) == len(expected_loss_values) - for key, value in loss_values.items(): - assert key in expected_loss_values - expected_loss_value = expected_loss_values[key] - assert value == pytest.approx(expected_loss_value, abs=0.001) - - def test_finetune_errors(self, capsys, pytestconfig): - large_scale = pytestconfig.getoption("--large-scale") - ckpt = "llama2_7b" if large_scale else "small_test_ckpt" - expected_loss_values = self._fetch_expected_loss_values(ckpt) - - kwargs_values = { - "dataset": "alpaca", - "train_on_input": False, - "seed": 9, - "shuffle": True, - "model": ckpt, - "model_checkpoint": self._fetch_ckpt_model_path(ckpt), - "tokenizer": "llama2_tokenizer", - "tokenizer_checkpoint": "/tmp/test-artifacts/tokenizer.model", - "batch_size": 8, - "lr": 2e-5, - "epochs": 2, - "max_steps_per_epoch": 2, - "optimizer": "AdamW", - "loss": "CrossEntropyLoss", - "output_dir": "/tmp", - "device": "cpu", - "dtype": "fp32", - "enable_activation_checkpointing": False, - "enable_fsdp": False, - "run_generation": None, - "metric_logger_type": "disk", - "project": None, - "resume_from_checkpoint": False, - "cpu_offload": True, - } - - with pytest.raises( - ValueError, - match="Cannot offload model to CPU if device is not cuda or <= 1 GPUs.", - ): - finetune_llm.recipe(FullFinetuneParams(**kwargs_values)) - - class TestFullFinetuneRecipe: def _fetch_loss_values(self, output) -> Dict[str, float]: lines = output.splitlines() diff --git a/tests/scripts/test_tune.py b/tests/scripts/test_tune.py index 1bdaaf6e01..3ce8586a36 100644 --- a/tests/scripts/test_tune.py +++ b/tests/scripts/test_tune.py @@ -32,7 +32,7 @@ def test_recipe_list(self, capsys): def test_recipe_cp(self, tmp_path, capsys): # Valid recipe - recipe = "finetune_llm" + recipe = "full_finetune" path = tmp_path / "dummy.py" testargs = f"tune recipe cp {recipe} {path}".split() with patch.object(sys, "argv", testargs): @@ -69,7 +69,7 @@ def test_recipe_paths(self): assert os.path.exists(recipe_path), f"{recipe_path} must exist" def test_config_list(self, capsys): - recipe = "finetune_llm" + recipe = "full_finetune" testargs = f"tune config list --recipe {recipe}".split() with patch.object(sys, "argv", testargs): runpy.run_path(TUNE_PATH, run_name="__main__") @@ -82,7 +82,7 @@ def test_config_list(self, capsys): def test_config_cp(self, tmp_path, capsys): # Valid recipe - config = "alpaca_llama2_finetune" + config = 
"alpaca_llama2_full_finetune" path = tmp_path / "dummy.yaml" testargs = f"tune config cp {config} {path}".split() with patch.object(sys, "argv", testargs): @@ -123,10 +123,10 @@ def test_config_paths(self): assert os.path.exists(config_path), f"{config_path} must exist" def test_run(self, capsys): - recipe = "finetune_llm" + recipe = "full_finetune" # Make sure we're not running on GPU which can lead to issues on GH CI testargs = f"\ - tune {recipe} --config alpaca_llama2_finetune --override tokenizer=fake \ + tune {recipe} --config alpaca_llama2_full_finetune --override tokenizer=fake \ device=cpu enable_fsdp=False enable_activation_checkpointing=False \ ".split() with patch.object(sys, "argv", testargs):