Save adapter config and remapped adapter weights for loading into PEFT #933

Merged · 18 commits · May 21, 2024

Changes from all commits
19 changes: 19 additions & 0 deletions recipes/lora_finetune_distributed.py
@@ -29,6 +29,7 @@
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_lora_module_names,
get_merged_lora_ckpt,
set_trainable_params,
validate_state_dict_for_lora,
@@ -278,6 +279,12 @@ def _setup_model(
the correct device.
"""

self._lora_rank = cfg_model.lora_rank
self._lora_alpha = cfg_model.lora_alpha
self._lora_attn_modules = list(cfg_model.lora_attn_modules)
self._apply_lora_to_mlp = cfg_model.apply_lora_to_mlp
self._apply_lora_to_output = getattr(cfg_model, "apply_lora_to_output", False)

if self._is_rank_zero:
log.info("FSDP is enabled. Instantiating Model on CPU for Rank 0 ...")
init_start = time.perf_counter()
@@ -510,6 +517,18 @@ def save_checkpoint(
}
)

adapter_config = {
"r": self._lora_rank,
"lora_alpha": self._lora_alpha,
"target_modules": get_lora_module_names(
self._lora_attn_modules,
self._apply_lora_to_mlp,
self._apply_lora_to_output,
),
"peft_type": "LORA",
}
checkpoint_dict.update({utils.ADAPTER_CONFIG: adapter_config})

self._checkpointer.save_checkpoint(
checkpoint_dict,
epoch=epoch,
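For readers skimming the diff: under a hypothetical config (the rank, alpha, and module list below are made-up values, and the helper is a stand-in that only mirrors the behavior the new checkpointer test assumes for get_lora_module_names), the adapter_config handed to the checkpointer would look roughly like this. The names are still torchtune-style at this point; they get translated to PEFT/HF names (o_proj, gate_proj, etc.) by the convert_weights changes further down.

```python
# Sketch only: lora_module_names_sketch is a hypothetical stand-in, not torchtune's
# get_lora_module_names; it just mirrors what the new checkpointer test expects.
from typing import List


def lora_module_names_sketch(
    lora_attn_modules: List[str],
    apply_lora_to_mlp: bool,
    apply_lora_to_output: bool,
) -> List[str]:
    names = list(lora_attn_modules)
    if apply_lora_to_mlp:
        names += ["w1", "w2", "w3"]  # torchtune's FeedForward projections
    if apply_lora_to_output:
        names.append("output")  # final output projection
    return names


adapter_config = {
    "r": 8,  # made-up rank
    "lora_alpha": 16,  # made-up alpha
    "target_modules": lora_module_names_sketch(
        ["q_proj", "v_proj"], apply_lora_to_mlp=True, apply_lora_to_output=False
    ),
    "peft_type": "LORA",
}
print(adapter_config["target_modules"])
# ['q_proj', 'v_proj', 'w1', 'w2', 'w3']
```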
22 changes: 18 additions & 4 deletions recipes/lora_finetune_single_device.py
@@ -22,6 +22,7 @@
from torchtune.datasets import ConcatDataset
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_lora_module_names,
get_merged_lora_ckpt,
set_trainable_params,
validate_missing_and_unexpected_for_lora,
@@ -258,6 +259,9 @@ def _setup_model(

self._lora_rank = cfg_model.lora_rank
self._lora_alpha = cfg_model.lora_alpha
self._lora_attn_modules = list(cfg_model.lora_attn_modules)
self._apply_lora_to_mlp = cfg_model.apply_lora_to_mlp
self._apply_lora_to_output = getattr(cfg_model, "apply_lora_to_output", False)
Comment on lines +263 to +264
Contributor:

not related to this PR, but maybe at some point we should consider replacing the apply_lora_to_* flags with just adding mlp and output to the lora_modules?

Contributor Author:

Yeah, agreed, I think this is likely where we'll head eventually. One thing is that we will probably want to make LoRA in MLP more configurable (i.e. use w1, w2, w3 (or hopefully more descriptive names) instead of mlp). Otherwise the relationship between e.g. q_proj (nn.Linear) and mlp (FeedForward) being in the same config is a bit confusing. Anyway, this shouldn't be a huge effort to change.


I agree that a single list is more intuitive, since, AFAICT, this is just consolidated into a single list under the hood.

> or hopefully more descriptive names

Changing names later on can invalidate the saved checkpoints, so would require some versioning for backwards compatibility.

Contributor:

I guess versioning or some sort of a converter/mapping? It would be great to figure this change out soon, but this point about checkpoint invalidation is a good one and something we should have a general solution for. I suspect this will come up many times.

self.adapter_params = get_adapter_params(model)
set_trainable_params(model, self.adapter_params)

@@ -275,11 +279,10 @@
)
else:
lora_missing, lora_unexpected = None, None

validate_missing_and_unexpected_for_lora(
lora_attn_modules=cfg_model.lora_attn_modules,
apply_lora_to_mlp=cfg_model.apply_lora_to_mlp,
apply_lora_to_output=getattr(cfg_model, "apply_lora_to_output", False),
lora_attn_modules=self._lora_attn_modules,
apply_lora_to_mlp=self._apply_lora_to_mlp,
apply_lora_to_output=self._apply_lora_to_output,
base_missing=base_missing,
base_unexpected=base_unexpected,
lora_missing=lora_missing,
@@ -417,6 +420,17 @@ def save_checkpoint(self, epoch: int) -> None:
k: v for k, v in self._model.state_dict().items() if adapter_key_filter(k)
}
ckpt_dict.update({utils.ADAPTER_KEY: adapter_state_dict})
adapter_config = {
"r": self._lora_rank,
"lora_alpha": self._lora_alpha,
"target_modules": get_lora_module_names(
self._lora_attn_modules,
self._apply_lora_to_mlp,
self._apply_lora_to_output,
),
"peft_type": "LORA",
Contributor:

Nice!


Not sure about this, but if the base model used for training was loaded from HF in the HF format (i.e. a transformers PretrainedModel), it should have a name_or_path attribute. This could be stored and, if it exists, added to the config here as base_model_name_or_path. This is not a required attribute for adapter_config.json, but it would be nice to have in a few situations.

Contributor Author:

Ah good point. I was trying to avoid this initially since it may necessitate some changes to our load_checkpoint method, as right now we really only retrieve and remap model weights. If it's more of a nice-to-have, I may punt on it for this particular PR to keep things more isolated to save_checkpoint. Lmk if this makes sense. Also cc @kartikayk if you have any general thoughts on loading state/metadata through load_checkpointer and passing through our recipe. I imagine this is something we may want to start supporting more for various integrations anyways.

Contributor:

Can you expand a bit more on why we would need base_model_name_or_path? Is this to make sure there are no bugs related to selecting the right base model for further training in HF land? If so, I wonder if this is something which is a "must have" rather than a "good to have"? Or let me know if I misunderstand.

If it's a must have, then is this something we can read from one of the json files or do we need to pass this information along through the recipe?


We don't strictly need base_model_name_or_path, but not having it means that the burden is on the user to figure out which base model this adapter belongs to. Of course, this can be solved with good documentation, but having it automatically in the adapter_config.json would be quite convenient.

Other points to consider:

  • When shared on HF Hub, this metadata can be used for other things (I'm not an expert on this though)
  • If base_model_name_or_path is present, users can load the adapter + base model in a single line of code (e.g. AutoModelForCausalLM.from_pretrained(<path-to-adapter>)).

}
ckpt_dict.update({utils.ADAPTER_CONFIG: adapter_config})
self._checkpointer.save_checkpoint(
ckpt_dict,
epoch=epoch,
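To make the discussion above concrete, here is a rough sketch of the PEFT-side loading flow the reviewer is describing. The model id and output path are placeholders, and it assumes the checkpointer has written adapter_model.bin and adapter_config.json into the output directory, as the new test below exercises.

```python
# Hypothetical usage sketch, not part of this PR. Swap in whatever base model the
# torchtune run actually fine-tuned and wherever the checkpointer wrote its output.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # placeholder base model id
    torch_dtype=torch.bfloat16,
)
# Because adapter_config.json does not (yet) carry base_model_name_or_path, the user
# has to pair the adapter with the correct base model themselves -- the reviewer's point.
peft_model = PeftModel.from_pretrained(base, "/path/to/torchtune/output_dir")
```

With base_model_name_or_path present, the single-call AutoModelForCausalLM.from_pretrained(<path-to-adapter>) route mentioned above would also become available.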
144 changes: 144 additions & 0 deletions tests/torchtune/utils/test_checkpointer.py
@@ -14,8 +14,14 @@
from torch import randn

from torchtune.models import llama2, mistral
from torchtune.modules.peft.peft_utils import (
get_adapter_params,
get_lora_module_names,
validate_missing_and_unexpected_for_lora,
)
from torchtune.utils._checkpointing import FullModelHFCheckpointer
from torchtune.utils._checkpointing._checkpointer_utils import safe_torch_load
from torchtune.utils.constants import ADAPTER_CONFIG, ADAPTER_KEY
from torchtune.utils.seed import set_seed

_VOCAB_SIZE = 100
@@ -293,6 +299,144 @@ def test_save_load_checkpoint_multiple_file(
assert len(output_state_dict_1.keys()) + 1 == len(orig_state_dict_1.keys())
assert len(output_state_dict_2.keys()) + 1 == len(orig_state_dict_2.keys())

def test_save_checkpoint_in_peft_format(
self,
single_file_checkpointer: FullModelHFCheckpointer,
llama2_hf_checkpoints: Tuple[Path, Path],
):
"""
Test save_checkpoint method within the FullModelCheckpointer for
integration with HF PEFT (i.e. save_in_peft_format=True).

We test that:
* The file adapter_config.json contains the fields required by PEFT
and the correct values
* The state dict keys of the saved adapter checkpoint are remapped as expected
* The state dict values of the saved adapter checkpoint (after key remapping)
match those in torchtune for parameters that are not permuted by HF
* The state dict values of the saved adapter checkpoint (after key remapping)
do not match those in torchtune for parameters that are permuted by HF, but the
sums along the dimension of permutation match
"""

# Define LoRA params for this test
lora_attn_modules = ["q_proj", "output_proj"]
apply_lora_to_mlp = True
apply_lora_to_output = True
lora_rank = 4
lora_alpha = 8

checkpoint_file, _ = llama2_hf_checkpoints
state_dict = single_file_checkpointer.load_checkpoint()

# Build LoRA Llama2 model and load in base model weights
model = llama2.lora_llama2(
lora_attn_modules=lora_attn_modules,
apply_lora_to_mlp=apply_lora_to_mlp,
apply_lora_to_output=apply_lora_to_output,
vocab_size=_VOCAB_SIZE,
num_layers=1,
num_heads=_NUM_HEADS,
num_kv_heads=_NUM_KV_HEADS,
embed_dim=_DIM,
max_seq_len=128,
lora_rank=lora_rank,
lora_alpha=lora_alpha,
)
missing, unexpected = model.load_state_dict(state_dict["model"], strict=False)
validate_missing_and_unexpected_for_lora(
lora_attn_modules=lora_attn_modules,
apply_lora_to_mlp=apply_lora_to_mlp,
apply_lora_to_output=apply_lora_to_output,
base_missing=missing,
base_unexpected=unexpected,
)

# LoRA B params are zero-initialized, randomly initialize them to make
# the test of their permutation on checkpoint save nontrivial
lora_b_sd = {
k: torch.randn_like(v)
for k, v in model.state_dict().items()
if "lora_b" in k
}
model.load_state_dict(lora_b_sd, strict=False)

# Construct the adapter weights and config and save using checkpointer
adapter_params = get_adapter_params(model)
adapter_key_filter = lambda x: x in adapter_params
expected_adapter_state_dict = {
k: v for k, v in model.state_dict().items() if adapter_key_filter(k)
}
adapter_config = {
"r": lora_rank,
"lora_alpha": lora_alpha,
"target_modules": get_lora_module_names(
lora_attn_modules,
apply_lora_to_mlp,
apply_lora_to_output,
),
"peft_type": "LORA",
}
state_dict.update({ADAPTER_KEY: expected_adapter_state_dict})
state_dict.update({ADAPTER_CONFIG: adapter_config})
single_file_checkpointer.save_checkpoint(state_dict, epoch=1)

# Load saved adapter weights and config from file for comparison
adapter_weights_file = Path.joinpath(
checkpoint_file.parent, "adapter_model.bin"
)
actual_adapter_state_dict = safe_torch_load(adapter_weights_file)

adapter_config_file = Path.joinpath(
checkpoint_file.parent, "adapter_config.json"
)
with open(adapter_config_file, "r") as f:
adapter_config = json.load(f)

expected_target_modules = [
"down_proj",
"gate_proj",
"lm_head",
"o_proj",
"q_proj",
"up_proj",
]
assert sorted(adapter_config["target_modules"]) == expected_target_modules

# Map PEFT keys back to torchtune keys
peft_to_tt = {
"o_proj": "output_proj",
"gate_proj": "w1",
"down_proj": "w2",
"up_proj": "w3",
"lm_head": "output",
}
for k, v in actual_adapter_state_dict.items():
new_k = k.replace("base_model.model.", "").replace("self_attn", "attn")
if "lm_head" not in new_k:
new_k = new_k.replace("model.", "")
for kk, vv in peft_to_tt.items():
if kk in k:
new_k = new_k.replace(kk, vv)
new_k = new_k.replace("lora_A", "lora_a").replace("lora_B", "lora_b")

# LoRA B matrix for Q should not match due to Q and K permutation
# However, since they're permuted along embed dim, their sum along that axis should match
if "lora_b" in new_k and "q_proj" in new_k:
assert not torch.allclose(
actual_adapter_state_dict[k], expected_adapter_state_dict[new_k]
)
torch.testing.assert_close(
actual_adapter_state_dict[k].sum(dim=0),
expected_adapter_state_dict[new_k].sum(dim=0),
)

# All other matrices should match exactly
if "lora_b" not in new_k:
torch.testing.assert_close(
actual_adapter_state_dict[k], expected_adapter_state_dict[new_k]
)


class TestHFMistralRewardModelFullModelCheckpointer:
@pytest.fixture
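For orientation, the key remapping that test_save_checkpoint_in_peft_format walks in reverse boils down to pairs like the one below. The layer index and module are arbitrary, and the exact PEFT-side prefix is my reading of the convert_weights changes in the next file, so treat it as illustrative rather than verified output.

```python
# Illustrative key pair only; derived by eye from the remapping logic, not from a real run.
tune_key = "layers.0.attn.q_proj.lora_a.weight"
peft_key = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"

# The test recovers tune_key from peft_key by stripping the "base_model.model." prefix,
# dropping the remaining "model." segment (except for lm_head), renaming self_attn -> attn,
# mapping PEFT module names back to torchtune ones, and lowercasing lora_A / lora_B.
```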
84 changes: 83 additions & 1 deletion torchtune/models/convert_weights.py
@@ -6,7 +6,7 @@

import re

from typing import Dict
from typing import Any, Dict

import torch

@@ -198,3 +198,85 @@ def _permute(t, n_heads):
converted_state_dict[new_key] = value

return converted_state_dict


# Mapping from torchtune LoRA module names to PEFT LoRA module names
_TO_PEFT_KEYS = {
Contributor:

Maybe some quick comments on what these dicts refer to?

"lora_a": "lora_A",
"lora_b": "lora_B",
}

# Mapping from torchtune module names to target modules for PEFT adapter config
_TO_PEFT_TARGET_MODULES = {


I wonder if a single mapping can be maintained for all supported architectures. I haven't actually tried if it works, but just checked the key names for the supported models and Phi3 seems to use gate_up_proj (https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main?show_file_info=model-00001-of-00002.safetensors). So I wonder if one mapping per architecture is required (with this being the default mapping).

Contributor Author:

Good point. I've actually only tested for Llama2 so far, I think you're right that we'll need a separate mapping at least for Phi-3. We do have something here for the full checkpoint mapping already, will just need to adapt it for PEFT purposes.

Contributor Author:

Update: there are other challenges with loading fine-tuned phi-3 checkpoints into PEFT from torchtune related to fused vs non-fused QKV. Namely, if someone fine-tunes in torchtune only on e.g. Q and K, they will not really be able to continue fine-tuning in PEFT in the way they would expect. In that case we can of course zero out the weights of the V chunk of the PEFT QKV LoRA matrix to get something that is in spirit correct, but (a) the user would probably expect only Q and K to remain trainable, which would not be the case, and (b) the learned LoRA weights from the torchtune finetune based on Q and K only may put any subsequent PEFT fine-tune using V as well in a suboptimal initial parameter space.

We could enforce up front that phi-3 LoRA is all-or-nothing on Q, K, and V for PEFT integration but I feel that's a bit messy. So for the time being I am opting to just raise a warning on checkpoint save that phi-3 adapter weights cannot be loaded into PEFT, and save just the usual torchtune adapter weights in that case.


I see, yes I think giving a warning is the best solution in this situation.

The only issue I have with the warning is that it is only given during checkpointing. I would be afraid that a user starts an expensive training run only to find out the next day that the checkpoint was not saved as expected. Would it be possible to give the warning already at model initialization time?

"q_proj": "q_proj",
"k_proj": "k_proj",
"v_proj": "v_proj",
"output_proj": "o_proj",
"w1": "gate_proj",
"w2": "down_proj",
"w3": "up_proj",
"output": "lm_head",
}

# Keys expected in PEFT's adapter_config.json
_PEFT_CONFIG_EXPECTED_KEYS = ["target_modules", "r", "lora_alpha"]


def tune_to_peft_adapter_config(
adapter_config: Dict[str, Any],
):
if not all([x in adapter_config.keys() for x in _PEFT_CONFIG_EXPECTED_KEYS]):
raise ValueError(
f"PEFT adapter config requires {_PEFT_CONFIG_EXPECTED_KEYS}, found {adapter_config.keys()}"
)

for k in adapter_config["target_modules"]:
if k not in _TO_PEFT_TARGET_MODULES:
raise ValueError(f"Unknown target module {k}")
adapter_config["target_modules"] = list(
map(_TO_PEFT_TARGET_MODULES.get, adapter_config["target_modules"])
)

return adapter_config


def tune_to_peft_adapter_weights(
Contributor:

@BenjaminBossan I'm curious what your thoughts are on this function. It seems like this (along with other similar conversion functions) is fairly brittle and susceptible to breakage resulting from changes in PEFT/Transformers. A couple of questions:

  • How brittle is this in practice? Do we expect changes in these keys or permutation logic often?
  • Are the unit tests enough to capture this? Do we need to add similar tests on the PEFT side as well?


  • How brittle is this in practice? Do we expect changes in these keys or permutation logic often?

No, there shouldn't be any frequent changes in this regard, as that would result in incompatibilities of old HF checkpoints as well. Generally, when something changes in the modeling code, we try to preserve the format of the checkpoint and re-map while loading the state_dict. I won't say it never happened in the past but I think it would generally be considered a bug and we'd fix it if notified.

  • Are the unit tests enough to capture this? Do we need to add similar tests on the PEFT side as well?

This probably wouldn't hurt. I could imagine that if you push a converted checkpoint to the HF Hub (ideally a small model), we can add a test to check if we can load it successfully.

state_dict: Dict[str, torch.Tensor],
num_heads: int = 32,
num_kv_heads: int = 32,
dim: int = 4096,
):
converted_state_dict = {}
full_mapping = {}
# Rather than recreate a separate mapping for LoRA adapter weights, we just
# re-use the _FROM_HF mapping for base model weights. We iterate over it twice:
# once to add mappings for LoRA A matrices and once to add mappings for LoRA B matrices.
for k, v in _TO_PEFT_KEYS.items():
full_mapping.update(
{
vv.replace(".weight", f".{k}.weight"): kk.replace(
".weight", f".{v}.weight"
)
for kk, vv in _FROM_HF.items()
if vv is not None
}
)
Comment on lines +254 to +263
Contributor:

This block can use some comments explaining what's going on here


head_dim = dim // num_heads

def _permute_lora_matrix(t, n_heads):
Contributor:

So these are permuted as well - nice find!

Contributor Author:

Only B matrices though 😃

rank = t.shape[-1]
return (
t.view(n_heads, head_dim // 2, 2, rank)
.transpose(1, 2)
.reshape((head_dim * n_heads), rank)
)

for key, value in state_dict.items():
new_key = get_mapped_key(key, full_mapping)
if "q_proj" in new_key and "lora_B" in new_key:
value = _permute_lora_matrix(value, num_heads)
elif "k_proj" in new_key and "lora_B" in new_key:
value = _permute_lora_matrix(value, num_kv_heads)
converted_state_dict["base_model.model." + new_key] = value
return converted_state_dict
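A quick check of why only the LoRA B matrices need permuting (the A matrices map into rank space, which the Q/K permutation never touches): the reshuffle in _permute_lora_matrix is a pure row permutation, so applying it to B alone reproduces the permuted low-rank update. The snippet below is a standalone sanity check with arbitrary small shapes, not code from this PR; permute_rows mirrors the reshuffle above, generalized to any trailing width.

```python
# Standalone sanity check (arbitrary shapes), not part of the PR.
import torch

n_heads, head_dim, rank, in_dim = 4, 8, 2, 16
dim = n_heads * head_dim


def permute_rows(t: torch.Tensor, n_heads: int, head_dim: int) -> torch.Tensor:
    # Same reshuffle as _permute_lora_matrix, applied to a matrix of any width.
    width = t.shape[-1]
    return (
        t.view(n_heads, head_dim // 2, 2, width)
        .transpose(1, 2)
        .reshape(n_heads * head_dim, width)
    )


lora_a = torch.randn(rank, in_dim)
lora_b = torch.randn(dim, rank)

# Permuting B and then forming the update matches permuting the merged update B @ A,
# because a row permutation commutes with right-multiplication by A.
torch.testing.assert_close(
    permute_rows(lora_b, n_heads, head_dim) @ lora_a,
    permute_rows(lora_b @ lora_a, n_heads, head_dim),
)
print("row-permutation check passed")
```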
1 change: 1 addition & 0 deletions torchtune/utils/__init__.py
@@ -29,6 +29,7 @@
from .argparse import TuneRecipeArgumentParser
from .collate import padded_collate, padded_collate_dpo
from .constants import ( # noqa
ADAPTER_CONFIG,
ADAPTER_KEY,
EPOCHS_KEY,
MAX_STEPS_KEY,