Commit fb1ef04
This commit updates the converter scripts to work with recent updates to JobConfig and ConfigManager.
It also changes the state dict adapter from a static class to an instance class that consumes the model args in its `__init__`, eliminating guesswork during state dict conversion. In addition, it adds support for building `config.json` when converting to HF, since HF requires this file for important tasks such as inference, and it moves `model_args` out of `train_spec` into a separate file to resolve a circular import with `state_dict_adapter`.
1 parent 9841bdb commit fb1ef04
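For reference, the adapter call pattern after this change, as exercised by the conversion scripts in this diff (a sketch; `train_spec`, `model_args`, and `state_dict` are defined by the surrounding scripts):

```python
# Adapter usage after this commit, taken from the conversion scripts below.
sd_adapter = train_spec.state_dict_adapter(model_args)  # instance; args bound once in __init__

# to_hf now also returns the data needed to write HF's config.json
hf_state_dict, config_json = sd_adapter.to_hf(state_dict)

# from_hf no longer needs model_args passed per call
state_dict = sd_adapter.from_hf(hf_state_dict)
```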

File tree

10 files changed: +194 −143 lines changed
docs/checkpoint.md

Lines changed: 59 additions & 33 deletions
````diff
@@ -1,19 +1,9 @@
-## How to convert a Llama 3 checkpoint for use in torchtitan
+# How to use checkpoints in TorchTitan
 
-If you want to continue training from an existing model checkpoint, the checkpoint must be in the DCP format expected by the checkpoint manager.
-An example script for converting the original Llama3 checkpoints into the expected DCP format can be found in `scripts/convert_llama_to_dcp.py`.
-
-The script expects a path to the original checkpoint files, and a path to an output directory:
-```bash
-python -m scripts.convert_llama_to_dcp <input_dir> <output_dir>
-```
+You may want to enable checkpointing in TorchTitan for better fault tolerance during training, or to enable easier importing and exporting of weights between TorchTitan and other libraries. TorchTitan offers varying degrees of support for other checkpoint formats, which are listed further below.
 
+## A general guide to use checkpoints during training
 
-## How to convert a torchtitan checkpoint for use in torchtune
-
-This guide will walk you through the steps required to convert a checkpoint from torchtitan so that it can be loaded into torchtune.
-
-### Steps
 1. ENABLE CHECKPOINTING
 In your torchtitan training config, ensure that `enable_checkpoint` is set to True.
 ```
@@ -22,8 +12,6 @@ enable_checkpoint = true
 folder = "checkpoint"
 interval = 500
 ```
-
-
 2. SAVE MODEL ONLY
 By setting `last_save_model_only` to `True`, the checkpoint will only contain the model and exclude the optimizer state and extra train states, resulting in a smaller checkpoint size.
 ```
@@ -41,7 +29,17 @@ last_save_model_only = true
 export_dtype = "bfloat16"
 ```
 
-4. EXAMPLE CHECKPOINT CONFIGURATION
+4. EXCLUDING SPECIFIC KEYS FROM CHECKPOINT LOADING
+In some cases, you may want to partially load from a previously trained checkpoint and modify certain settings, such as the number of GPUs or the current step. To achieve this, you can use the `exclude_from_loading` parameter to specify which keys should be excluded from loading.
+This parameter takes a list of strings that should be excluded from loading.
+```
+[checkpoint]
+enable_checkpoint = true
+exclude_from_loading = ["data_loader", "lr_scheduler"]
+```
+When used on the command line, the parameter should be a comma-separated list of strings. For example: `--checkpoint.exclude_from_loading data_loader,lr_scheduler`.
+
+5. EXAMPLE CHECKPOINT CONFIGURATION
 ```
 [checkpoint]
 enable_checkpoint = true
@@ -52,30 +50,63 @@ last_save_model_only = true
 export_dtype = "bfloat16"
 ```
 
-5. SAVE THE FINAL CHECKPOINT\
-Once the above have been set, the final checkpoint at the end of the training step will consist of model only with the desired export dtype. However, if the final step has not been reached yet, full checkpoints will still be saved so that training can be resumed.
+A more exhaustive and up-to-date list of checkpoint config options can be found in torchtitan/config/job_config.py
 
-6. CONVERT SHARDED CHECKPOINTS TO A SINGLE FILE\
-Finally, once you have obtained the last checkpoint, you can use the following command to convert the sharded checkpoints to a single .pt file that can be loaded into torchtune:
+## Conversion support
 
-```
-python -m torch.distributed.checkpoint.format_utils dcp_to_torch torchtitan/outputs/checkpoint/step-1000 checkpoint.pt
+### PyTorch Meta Llama
+
+If you want to continue training from an existing model checkpoint, the checkpoint must be in the DCP format expected by the checkpoint manager.
+An example script for converting the original Llama3 checkpoints into the expected DCP format can be found in `scripts/checkpoint_conversion/convert_from_llama.py`.
+
+The script expects a path to the original checkpoint files, and a path to an output directory:
+```bash
+python -m scripts.checkpoint_conversion.convert_from_llama <input_dir> <output_dir>
 ```
 
-7. EXCLUDING SPECIFIC KEYS FROM CHECKPOINT LOADING
-In some cases, you may want to partially load from a previous-trained checkpoint and modify certain settings, such as the number of GPUs or the current step. To achieve this, you can use the `exclude_from_loading` parameter to specify which keys should be excluded from loading.
-This parameter takes a list of string that should be excluded from loading.
+
+### Torchtune
+
+This guide will walk you through the steps required to convert a checkpoint from torchtitan so that it can be loaded into torchtune.
+
+1. CHECKPOINT CONFIGURATION
 ```
 [checkpoint]
 enable_checkpoint = true
-exclude_from_loading = ["data_loader", "lr_scheduler"]
+folder = "checkpoint"
+interval = 10
+last_save_model_only = true
+export_dtype = "bfloat16"
 ```
-When used in command line, the parameter should be a comma-separated list of strings. For example: `--checkpoint.exclude_from_loading data_loader,lr_scheduler`.
+
+2. SAVE THE FINAL CHECKPOINT\
+Once the above have been set, the final checkpoint at the end of the training step will consist of the model only, with the desired export dtype. However, if the final step has not been reached yet, full checkpoints will still be saved so that training can be resumed.
+
+3. CONVERT SHARDED CHECKPOINTS TO A SINGLE FILE\
+Finally, once you have obtained the last checkpoint, you can use the following command to convert the sharded checkpoints to a single .pt file that can be loaded into torchtune:
+
+```
+python -m torch.distributed.checkpoint.format_utils dcp_to_torch torchtitan/outputs/checkpoint/step-1000 checkpoint.pt
+```
+
 
 That's it. You have now successfully converted a sharded torchtitan checkpoint for use in torchtune.
 
+### HuggingFace
+TorchTitan now supports two methods of HuggingFace interoperability: directly saving and loading an HF checkpoint during training, or using an example conversion script to reformat the weights directly.
 
-## How to create a seed checkpoint
+1. You can directly save HuggingFace model weights during training by using the `--checkpoint.last_save_in_safetensors_format` and `--checkpoint.last_save_model_only` options together. To directly load a torchtitan training session from a HuggingFace safetensors file, simply enable `--checkpoint.initial_load_model_only` and set `--checkpoint.initial_load_path` to the directory containing the HuggingFace checkpoint.
+
+2. To directly reformat the weights without the need to run a training loop, run the corresponding conversion script. The naming scheme is torchtitan-centric, e.g. convert_from_hf means convert hf->tt.
+
+```
+python ./scripts/checkpoint_conversion/convert_from_hf.py <input_dir> <output_dir> --model_name <model_name> --model_flavor <model_flavor>
+python ./scripts/checkpoint_conversion/convert_to_hf.py <input_dir> <output_dir> --model_name <model_name> --model_flavor <model_flavor>
+# e.g.
+python ./scripts/checkpoint_conversion/convert_from_hf.py ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920/ ./outputs/checkpoint/step-0 --model_name llama3 --model_flavor 8B
+```
+
+### Seed Checkpoint
 Sometimes one needs to create a seed checkpoint to initialize a model from step 0.
 E.g. it is hard, if not impossible, for meta initialization on multiple devices to reproduce the initialization on a single device.
 A seed checkpoint does initialization of the model on a single CPU, and can be loaded from another job on an arbitrary number of GPUs via DCP resharding.
@@ -85,8 +116,3 @@ e.g.
 ```bash
 NGPU=1 CONFIG=<path_to_model_config> ./run_train.sh --checkpoint.enable_checkpoint --checkpoint.create_seed_checkpoint --parallelism.data_parallel_replicate_degree 1 --parallelism.data_parallel_shard_degree 1 --parallelism.tensor_parallel_degree 1 --parallelism.pipeline_parallel_degree 1 --parallelism.context_parallel_degree 1 --parallelism.expert_parallel_degree 1
 ```
-
-
-## How to load / save a checkpoint in HF safetensors format
-For save, users need to set `--checkpoint.last_save_in_safetensors_format` and `--checkpoint.last_save_model_only` to save the last checkpoint in HF format (intermediate ones are always in DCP format).
-For load, users need to either put the checkpoint in the `step-0` folder if using `--checkpoint.folder`, or specify `--checkpoint.initial_load_path` to load from a different folder. They also need to set `--checkpoint.initial_load_model_only` to load the checkpoint in HF format.
````
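As a concrete example of the HF save/load flow described in the new docs (a hypothetical invocation; the checkpoint path is illustrative, the flags are as documented above):

```bash
# Save the last checkpoint in HF safetensors format (intermediate ones stay DCP)
./run_train.sh --checkpoint.enable_checkpoint \
  --checkpoint.last_save_model_only \
  --checkpoint.last_save_in_safetensors_format

# Resume directly from a directory containing an HF checkpoint (path illustrative)
./run_train.sh --checkpoint.enable_checkpoint \
  --checkpoint.initial_load_model_only \
  --checkpoint.initial_load_path ./hf_checkpoints/Meta-Llama-3-8B/
```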

scripts/convert_from_hf.py renamed to scripts/checkpoint_conversion/convert_from_hf.py

Lines changed: 3 additions & 15 deletions
```diff
@@ -12,8 +12,6 @@
 import torchtitan.protocols.train_spec as train_spec_module
 from torch.distributed.checkpoint import HuggingFaceStorageReader
 from torchtitan.components.checkpoint import ModelWrapper
-from torchtitan.components.tokenizer import build_hf_tokenizer
-from torchtitan.config_manager import ConfigManager
 
 
 @torch.inference_mode()
@@ -22,41 +20,31 @@ def convert_from_hf(input_dir, output_dir, model_name, model_flavor):
     train_spec = train_spec_module.get_train_spec(model_name)
     model_args = train_spec.model_args[model_flavor]
 
-    config_manager = ConfigManager()
-    config = config_manager.parse_args(
-        [
-            "--model.tokenizer-path",
-            "./assets/tokenizer/Llama-3.1-8B",
-        ]
-    )
-    tokenizer = build_hf_tokenizer(config)
-    model_args.update_from_config(config, tokenizer)
     with torch.device("cpu"):
         model = train_spec.model_cls(model_args)
     model = ModelWrapper(model)
 
-    sd_adapter = train_spec.state_dict_adapter
+    sd_adapter = train_spec.state_dict_adapter(model_args)
     assert (
         sd_adapter is not None
     ), "trying to convert checkpoint from HF to DCP safetensors format, but sd_adapter is not provided."
     # get state dict in tt format with allocated memory
     state_dict = model._get_state_dict()
     # convert empty state dict to hf format so that hf weights can be loaded into it
-    hf_state_dict = sd_adapter.to_hf(state_dict, model_args)
+    hf_state_dict, _ = sd_adapter.to_hf(state_dict)
     dcp.load(
         hf_state_dict,
         storage_reader=HuggingFaceStorageReader(path=input_dir),
     )
     # convert state dict format back hf->tt and save
-    state_dict = sd_adapter.from_hf(hf_state_dict, model_args)
+    state_dict = sd_adapter.from_hf(hf_state_dict)
     dcp.save(
         state_dict,
         checkpoint_id=output_dir,
     )
 
 
 if __name__ == "__main__":
-    init_logger()
     parser = argparse.ArgumentParser(description="Convert Llama weights to DCP format.")
     parser.add_argument(
         "input_dir", type=Path, help="Input directory with original Llama weights."
```

scripts/convert_llama_to_dcp.py renamed to scripts/checkpoint_conversion/convert_from_llama.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -14,7 +14,7 @@
 
 
 @torch.inference_mode()
-def convert_llama_weights(input_dir, output_dir, max_seq_len: int):
+def convert_from_llama(input_dir, output_dir, max_seq_len: int):
     with open(input_dir / "params.json", "r") as f:
         params = json.load(f)
     n_layers = params["n_layers"]
@@ -143,4 +143,4 @@ def convert_llama_weights(input_dir, output_dir, max_seq_len: int):
     )
     args = parser.parse_args()
 
-    convert_llama_weights(args.input_dir, args.output_dir, max_seq_len=args.max_seq_len)
+    convert_from_llama(args.input_dir, args.output_dir, max_seq_len=args.max_seq_len)
```
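Assuming the script's argparse interface matches the function signature (an assumption: only `max_seq_len` and the positional paths are visible in these hunks), the renamed entry point would be invoked along these lines:

```bash
# hypothetical invocation; the --max_seq_len flag name is inferred from args.max_seq_len
python -m scripts.checkpoint_conversion.convert_from_llama <input_dir> <output_dir> --max_seq_len 4096
```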

scripts/convert_to_hf.py renamed to scripts/checkpoint_conversion/convert_to_hf.py

Lines changed: 7 additions & 13 deletions
```diff
@@ -5,15 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 import argparse
+import json
 from pathlib import Path
 
 import torch
 import torch.distributed.checkpoint as dcp
 import torchtitan.protocols.train_spec as train_spec_module
 from torch.distributed.checkpoint import HuggingFaceStorageWriter
 from torchtitan.components.checkpoint import ModelWrapper
-from torchtitan.components.tokenizer import build_hf_tokenizer
-from torchtitan.config_manager import ConfigManager
 
 
 @torch.inference_mode()
@@ -22,20 +21,11 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor):
     train_spec = train_spec_module.get_train_spec(model_name)
     model_args = train_spec.model_args[model_flavor]
 
-    config_manager = ConfigManager()
-    config = config_manager.parse_args(
-        [
-            "--model.tokenizer-path",
-            "./assets/tokenizer/Llama-3.1-8B",
-        ]
-    )
-    tokenizer = build_hf_tokenizer(config)
-    model_args.update_from_config(config, tokenizer)
     with torch.device("cpu"):
         model = train_spec.model_cls(model_args)
     model = ModelWrapper(model)
 
-    sd_adapter = train_spec.state_dict_adapter
+    sd_adapter = train_spec.state_dict_adapter(model_args)
     assert (
         sd_adapter is not None
     ), "trying to convert checkpoint from DCP to HF safetensors format, but sd_adapter is not provided."
@@ -48,7 +38,7 @@ def convert_to_hf(input_dir, output_dir, model_name, model_flavor):
     )
 
     # convert state dict tt->hf
-    hf_state_dict = sd_adapter.to_hf(state_dict, model_args)
+    hf_state_dict, config_json = sd_adapter.to_hf(state_dict)
 
     fqn_to_index_mapping = {}
     num_fqns_per_file = 30
@@ -70,6 +60,10 @@
         storage_writer=storage_writer,
     )
 
+    config_path = output_dir / "config.json"
+    with config_path.open("w") as f:
+        json.dump(config_json, f, indent=4)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Convert Llama weights to HF format.")
```
torchtitan/components/checkpoint.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -6,13 +6,15 @@
 
 import enum
 import functools
+import json
 import os
 import queue
 import re
 import shutil
 import threading
 import time
 from concurrent.futures import Future
+from pathlib import Path
 from typing import Any
 
 import torch
@@ -192,7 +194,7 @@ def __init__(
         lr_schedulers: LRSchedulersContainer,
         states: dict[str, Any],
         checkpoint_config: CheckpointConfig,
-        sd_adapter: type[StateDictAdapter] | None,
+        sd_adapter: StateDictAdapter | None,
         base_folder: str = "",
         ft_manager: FTManager | None = None,
     ) -> None:
@@ -202,7 +204,6 @@
             assert (
                 sd_adapter is not None
             ), "job_config.checkpoint.last_save_in_hf is True, but sd_adapter is not provided."
-            self.sd_adapter = sd_adapter
 
         self.ft_manager = (
             ft_manager.manager if ft_manager and ft_manager.enabled else None
@@ -358,7 +359,7 @@ def dcp_save(
             assert (
                 self.sd_adapter is not None
             ), "trying to save checkpoint in HF safetensors format, but sd_adapter is not provided."
-            state_dict = self.sd_adapter.to_hf(state_dict)
+            state_dict, config_json = self.sd_adapter.to_hf(state_dict)
 
             fqn_to_index_mapping = {}
             num_fqns_per_file = 30
@@ -376,6 +377,9 @@
                 enable_consolidation=True,
                 thread_count_consolidation=5,
             )
+            config_path = Path(checkpoint_id) / "config.json"
+            with config_path.open("w") as f:
+                json.dump(config_json, f, indent=4)
         else:
             checkpoint_save_id = checkpoint_id
 
@@ -425,7 +429,7 @@ def dcp_load(
             assert (
                 self.sd_adapter is not None
             ), "trying to load checkpoint in HF safetensors format, but sd_adapter is not provided."
-            hf_state_dict = self.sd_adapter.to_hf(
+            hf_state_dict, _ = self.sd_adapter.to_hf(
                 state_dict, self.states["train_state"].model_args
             )
 
```
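The signature change above implies a corresponding change at the construction site, which is not part of this diff; a sketch of what it presumably looks like:

```python
# Inferred call-site change (hypothetical; the trainer wiring is not in this diff):
#
# before: CheckpointManager(..., sd_adapter=train_spec.state_dict_adapter, ...)
# after:  CheckpointManager(..., sd_adapter=train_spec.state_dict_adapter(model_args), ...)
#
# i.e. the caller now constructs the adapter instance from the model args and
# passes it in, instead of passing the adapter class for CheckpointManager to use.
```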