added model definition conversion for llama3

Wesley Truong · wesleytruong · commit 95009788eee1 · 2025-07-22T15:38:37.000-07:00
diff --git a/scripts/convert_from_hf.py b/scripts/convert_from_hf.py
@@ -0,0 +1,92 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from pathlib import Path
+
+import torch
+import torch.distributed.checkpoint as dcp
+import torchtitan.protocols.train_spec as train_spec_module
+from torch.distributed.checkpoint import HuggingFaceStorageReader
+from torchtitan.components.checkpoint import ModelWrapper
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.config_manager import ConfigManager
+from torchtitan.tools.logging import init_logger
+
+
+@torch.inference_mode()
+def convert_from_hf(input_dir, output_dir, model_name, model_flavor):
+    """Convert a Hugging Face checkpoint into a DCP checkpoint.
+
+    The model is built on CPU only to obtain a state dict to load into. That
+    state dict is converted to HF format, filled from ``input_dir``, converted
+    back by the train spec's state dict adapter and saved to ``output_dir``.
+
+    Args:
+        input_dir: Directory read through ``HuggingFaceStorageReader``.
+        output_dir: Destination passed to ``dcp.save`` as ``checkpoint_id``.
+        model_name: Name used to look up the train spec.
+        model_flavor: Key selecting the model args within the train spec.
+
+    Raises:
+        AssertionError: If the train spec has no state dict adapter.
+    """
+    # initialize model to allocate memory for state dict
+    train_spec = train_spec_module.get_train_spec(model_name)
+    model_args = train_spec.model_args[model_flavor]
+
+    # NOTE: the tokenizer path is hard-coded to the Llama-3.1-8B assets,
+    # whatever model_name / model_flavor are passed in.
+    config_manager = ConfigManager()
+    config = config_manager.parse_args(
+        [
+            "--model.tokenizer-path",
+            "./assets/tokenizer/Llama-3.1-8B",
+        ]
+    )
+    tokenizer = build_hf_tokenizer(config)
+    model_args.update_from_config(config, tokenizer)
+    with torch.device("cpu"):
+        model = train_spec.model_cls(model_args)
+    model = ModelWrapper(model)
+
+    sd_adapter = train_spec.state_dict_adapter
+    assert (
+        sd_adapter is not None
+    ), "trying to convert checkpoint from HF to DCP safetensors format, but sd_adapter is not provided."
+    # get state dict in tt format with allocated memory
+    state_dict = model._get_state_dict()
+    # convert empty state dict to hf format so that hf weights can be loaded into it
+    hf_state_dict = sd_adapter.to_hf(state_dict, model_args)
+    dcp.load(
+        hf_state_dict,
+        storage_reader=HuggingFaceStorageReader(path=input_dir),
+    )
+    # convert state dict format back hf->tt and save
+    state_dict = sd_adapter.from_hf(hf_state_dict, model_args)
+    dcp.save(
+        state_dict,
+        checkpoint_id=output_dir,
+    )
+
+
+if __name__ == "__main__":
+    init_logger()
+    parser = argparse.ArgumentParser(description="Convert Llama weights to DCP format.")
+    parser.add_argument(
+        "input_dir", type=Path, help="Input directory with Hugging Face weights."
+    )
+    parser.add_argument("output_dir", type=Path, help="Output directory for DCP.")
+    parser.add_argument("--model_name", type=str, nargs="?", default="llama3")
+    parser.add_argument("--model_flavor", type=str, nargs="?", default="8B")
+    args = parser.parse_args()
+
+    convert_from_hf(
+        args.input_dir,
+        args.output_dir,
+        args.model_name,
+        args.model_flavor,
+    )
diff --git a/scripts/convert_to_hf.py b/scripts/convert_to_hf.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from pathlib import Path
+
+import torch
+import torch.distributed.checkpoint as dcp
+import torchtitan.protocols.train_spec as train_spec_module
+from torch.distributed.checkpoint import HuggingFaceStorageWriter
+from torchtitan.components.checkpoint import ModelWrapper
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.config_manager import ConfigManager
+
+
+@torch.inference_mode()
+def convert_to_hf(input_dir, output_dir, model_name, model_flavor):
+    """Convert a DCP checkpoint into Hugging Face safetensors format.
+
+    The model is built on CPU only to obtain a state dict to load into. The
+    checkpoint in ``input_dir`` is loaded into it, converted to HF format by the
+    train spec's state dict adapter and saved through ``HuggingFaceStorageWriter``.
+
+    Args:
+        input_dir: DCP checkpoint passed to ``dcp.load`` as ``checkpoint_id``.
+        output_dir: Path given to ``HuggingFaceStorageWriter``.
+        model_name: Name used to look up the train spec.
+        model_flavor: Key selecting the model args within the train spec.
+
+    Raises:
+        AssertionError: If the train spec has no state dict adapter.
+    """
+    # load model and model args so that we can get the state dict shape
+    train_spec = train_spec_module.get_train_spec(model_name)
+    model_args = train_spec.model_args[model_flavor]
+
+    # NOTE: the tokenizer path is hard-coded to the Llama-3.1-8B assets,
+    # whatever model_name / model_flavor are passed in.
+    config_manager = ConfigManager()
+    config = config_manager.parse_args(
+        [
+            "--model.tokenizer-path",
+            "./assets/tokenizer/Llama-3.1-8B",
+        ]
+    )
+    tokenizer = build_hf_tokenizer(config)
+    model_args.update_from_config(config, tokenizer)
+    with torch.device("cpu"):
+        model = train_spec.model_cls(model_args)
+    model = ModelWrapper(model)
+
+    sd_adapter = train_spec.state_dict_adapter
+    assert (
+        sd_adapter is not None
+    ), "trying to convert checkpoint from DCP to HF safetensors format, but sd_adapter is not provided."
+
+    # allocate state dict memory with empty weights to load checkpoint
+    state_dict = model._get_state_dict()
+    dcp.load(
+        state_dict,
+        checkpoint_id=input_dir,
+    )
+
+    # convert state dict tt->hf
+    hf_state_dict = sd_adapter.to_hf(state_dict, model_args)
+
+    # number the keys in groups of num_fqns_per_file, starting at 1; the storage
+    # writer receives the result as its fqn -> index mapping
+    num_fqns_per_file = 30
+    fqn_to_index_mapping = {
+        key: i // num_fqns_per_file + 1 for i, key in enumerate(hf_state_dict)
+    }
+
+    storage_writer = HuggingFaceStorageWriter(
+        path=output_dir,
+        save_distributed=True,
+        fqn_to_index_mapping=fqn_to_index_mapping,
+        enable_consolidation=True,
+        thread_count_consolidation=5,
+    )
+
+    dcp.save(
+        hf_state_dict,
+        storage_writer=storage_writer,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Convert Llama weights to HF format.")
+    parser.add_argument(
+        "input_dir", type=Path, help="Input directory with the DCP checkpoint."
+    )
+    parser.add_argument(
+        "output_dir", type=Path, help="Output directory for HF safetensors."
+    )
+    parser.add_argument("--model_name", type=str, nargs="?", default="llama3")
+    parser.add_argument("--model_flavor", type=str, nargs="?", default="8B")
+    args = parser.parse_args()
+
+    convert_to_hf(
+        args.input_dir,
+        args.output_dir,
+        args.model_name,
+        args.model_flavor,
+    )
diff --git a/torchtitan/components/checkpoint.py b/torchtitan/components/checkpoint.py
@@ -348,7 +348,9 @@ def dcp_save(
         checkpoint_save_id: str | None = None
         if to_hf:
             assert self.sd_adapter is not None
-            state_dict = self.sd_adapter.to_hf(state_dict)
+            state_dict = self.sd_adapter.to_hf(
+                state_dict, self.states["train_state"].model_args
+            )
 
             fqn_to_index_mapping = {}
             num_fqns_per_file = 30
@@ -415,14 +417,18 @@ def dcp_load(
             assert (
                 self.sd_adapter is not None
             ), "trying to load checkpoint in HF safetensors format, but sd_adapter is not provided."
-            hf_state_dict = self.sd_adapter.to_hf(state_dict)
+            hf_state_dict = self.sd_adapter.to_hf(
+                state_dict, self.states["train_state"].model_args
+            )
 
             dcp.load(
                 hf_state_dict,
                 storage_reader=HuggingFaceStorageReader(path=checkpoint_id),
             )
 
-            state_dict = self.sd_adapter.from_hf(hf_state_dict)
+            state_dict = self.sd_adapter.from_hf(
+                hf_state_dict, self.states["train_state"].model_args
+            )
             self.states[MODEL].load_state_dict(state_dict)
         else:
             dcp.load(state_dict, checkpoint_id=checkpoint_id)
diff --git a/torchtitan/models/llama3/model/state_dict_adapter.py b/torchtitan/models/llama3/model/state_dict_adapter.py
@@ -4,18 +4,159 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import re
 from typing import Any
 
 from torchtitan.protocols.state_dict_adapter import StateDictAdapter
 
+from .args import TransformerModelArgs
+
 
 class Llama3StateDictAdapter(StateDictAdapter):
+    """Convert Llama 3 state dicts between torchtitan and Hugging Face key names."""
+
+    # Hugging Face key -> torchtitan key; "{}" stands for the layer index.
+    # A value of None marks a Hugging Face entry that from_hf drops.
+    from_hf_map = {
+        "model.embed_tokens.weight": "tok_embeddings.weight",
+        "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+        "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+        "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+        "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        "model.layers.{}.self_attn.rotary_emb.inv_freq": None,
+        "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
+        "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+        "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+        "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+        "model.norm.weight": "norm.weight",
+        "lm_head.weight": "output.weight",
+    }
+    # Reverse lookup: torchtitan key -> Hugging Face key (dropped entries excluded).
+    to_hf_map = {v: k for k, v in from_hf_map.items() if v is not None}
+
+    # HuggingFace permutation function (exact copy from their conversion script)
     @staticmethod
-    def to_hf(state_dict: dict[str, Any]) -> dict[str, Any]:
-        # TODO: implement this
-        return state_dict
+    def _permute(w, n_heads_arg, dim1=None, dim2=None):
+        """Reorder the rows of ``w`` from the torchtitan to the Hugging Face layout.
+
+        ``w`` is viewed as (n_heads_arg, dim1 // n_heads_arg // 2, 2, dim2), axes 1
+        and 2 are swapped, and the result is reshaped to (dim1, dim2) and cloned.
+        ``dim1`` and ``dim2`` default to ``w.shape[0]`` and ``w.shape[1]``.
+        """
+        if dim1 is None:
+            dim1 = w.shape[0]
+        if dim2 is None:
+            dim2 = w.shape[1]
+        return (
+            w.view(n_heads_arg, dim1 // n_heads_arg // 2, 2, dim2)
+            .transpose(1, 2)
+            .reshape(dim1, dim2)
+            .clone()
+        )
+
+    @staticmethod
+    def _reverse_permute(w, n_heads_arg, dim1=None, dim2=None):
+        """Undo ``_permute``, restoring the torchtitan row order of ``w``.
+
+        ``w`` is viewed as (n_heads_arg, 2, dim1 // n_heads_arg // 2, dim2), axes 1
+        and 2 are swapped, and the result is reshaped to (dim1, dim2).
+        ``dim1`` and ``dim2`` default to ``w.shape[0]`` and ``w.shape[1]``.
+        """
+        if dim1 is None:
+            dim1 = w.shape[0]
+        if dim2 is None:
+            dim2 = w.shape[1]
+        return (
+            w.view(n_heads_arg, 2, dim1 // n_heads_arg // 2, dim2)
+            .transpose(1, 2)
+            .reshape(dim1, dim2)
+        )
 
     @staticmethod
-    def from_hf(hf_state_dict: dict[str, Any]) -> dict[str, Any]:
-        # TODO: implement this
+    def to_hf(
+        state_dict: dict[str, Any], model_args: TransformerModelArgs
+    ) -> dict[str, Any]:
+        """Return a new dict holding ``state_dict`` under Hugging Face key names.
+
+        ``wq`` and ``wk`` weights are permuted using the head counts and ``dim``
+        from ``model_args``; all other values are stored unchanged.
+
+        Raises:
+            KeyError: If a key has no entry in ``to_hf_map``.
+        """
+        n_heads = model_args.n_heads
+        n_kv_heads = (
+            model_args.n_kv_heads if model_args.n_kv_heads is not None else n_heads
+        )
+        dim = model_args.dim
+        head_dim = dim // n_heads
+        hf_state_dict = {}
+
+        for key, value in state_dict.items():
+            if "layers" in key:
+                abstract_key = re.sub(r"(\d+)", "{}", key, count=1)
+                layer_num = re.search(r"\d+", key).group(0)
+                new_key = Llama3StateDictAdapter.to_hf_map[abstract_key]
+                # We need to permute the weights in wq and wk layer in order to account for the difference between
+                # the native Llama and huggingface RoPE implementation.
+                if abstract_key == "layers.{}.attention.wq.weight":
+                    value = Llama3StateDictAdapter._permute(value, n_heads)
+                if abstract_key == "layers.{}.attention.wk.weight":
+                    key_value_dim = head_dim * n_kv_heads
+                    value = Llama3StateDictAdapter._permute(
+                        value, n_kv_heads, key_value_dim, dim
+                    )
+
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = Llama3StateDictAdapter.to_hf_map[key]
+
+            hf_state_dict[new_key] = value
         return hf_state_dict
+
+    @staticmethod
+    def from_hf(
+        hf_state_dict: dict[str, Any], model_args: TransformerModelArgs
+    ) -> dict[str, Any]:
+        """Return a new dict holding ``hf_state_dict`` under torchtitan key names.
+
+        Entries whose key maps to None in ``from_hf_map`` are dropped. ``q_proj``
+        and ``k_proj`` weights are reverse-permuted; other values are unchanged.
+
+        Raises:
+            KeyError: If a key has no entry in ``from_hf_map``.
+        """
+        n_heads = model_args.n_heads
+        n_kv_heads = (
+            model_args.n_kv_heads if model_args.n_kv_heads is not None else n_heads
+        )
+        dim = model_args.dim
+        head_dim = dim // n_heads
+        state_dict = {}
+
+        for key, value in hf_state_dict.items():
+            if "layers" in key:
+                abstract_key = re.sub(r"(\d+)", "{}", key, count=1)
+                layer_num = re.search(r"\d+", key).group(0)
+                new_key = Llama3StateDictAdapter.from_hf_map[abstract_key]
+                # Entries mapped to None have no torchtitan counterpart; skip them.
+                if new_key is None:
+                    continue
+
+                # We need to permute the weights in wq and wk layer in order to account for the difference between
+                # the native Llama and huggingface RoPE implementation.
+                if abstract_key == "model.layers.{}.self_attn.q_proj.weight":
+                    value = Llama3StateDictAdapter._reverse_permute(value, n_heads)
+                if abstract_key == "model.layers.{}.self_attn.k_proj.weight":
+                    key_value_dim = head_dim * n_kv_heads
+                    value = Llama3StateDictAdapter._reverse_permute(
+                        value, n_kv_heads, key_value_dim, dim
+                    )
+
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = Llama3StateDictAdapter.from_hf_map[key]
+
+            state_dict[new_key] = value
+        return state_dict
diff --git a/torchtitan/train.py b/torchtitan/train.py
@@ -137,15 +137,15 @@ def __init__(self, job_config: JobConfig):
         )
 
         # build model (using meta init)
-        model_args = self.train_spec.model_args[job_config.model.flavor]
+        self.model_args = self.train_spec.model_args[job_config.model.flavor]
         # set the model args from training job configs
-        model_args.update_from_config(job_config, tokenizer)
+        self.model_args.update_from_config(job_config, tokenizer)
 
         logger.info(
-            f"Building {self.train_spec.name} {job_config.model.flavor} with {model_args}"
+            f"Building {self.train_spec.name} {job_config.model.flavor} with {self.model_args}"
         )
         with torch.device("meta"):
-            model = self.train_spec.model_cls(model_args)
+            model = self.train_spec.model_cls(self.model_args)
 
         # Build the collection of model converters. No-op if `model.converters` empty
         model_converters = build_model_converters(job_config, parallel_dims)
@@ -158,15 +158,15 @@ def __init__(self, job_config: JobConfig):
             else self.train_spec.build_metrics_processor_fn
         )
         self.metrics_processor = build_metrics_processor_fn(
-            job_config, parallel_dims, model_args
+            job_config, parallel_dims, self.model_args
         )
         color = self.metrics_processor.color
 
         # calculate model size and flops per token
         (
             model_param_count,
             self.metrics_processor.num_flops_per_token,
-        ) = model_args.get_nparams_and_flops(model, job_config.training.seq_len)
+        ) = self.model_args.get_nparams_and_flops(model, job_config.training.seq_len)
 
         logger.info(
             f"{color.blue}Model {self.train_spec.name} {job_config.model.flavor} "
@@ -229,7 +229,7 @@ def __init__(self, job_config: JobConfig):
                 parallel_dims,
                 job_config,
                 self.device,
-                model_args,
+                self.model_args,
                 self.train_spec.parallelize_fn,
                 self.loss_fn,
             )