Commit (fork of pytorch/torchtune)
Co-authored-by: Kartikay Khandelwal <47255723+kartikayk@users.noreply.github.com>
Co-authored-by: ebsmothers <ebs@meta.com>
Co-authored-by: Rafi Ayub <33648637+RdoubleA@users.noreply.github.com>
Co-authored-by: Joe Cummings <jrcummings27@gmail.com>
Co-authored-by: Salman Mohammadi <salman.mohammadi@outlook.com>
Co-authored-by: Rohan Varma <rvarm1@fb.com>
Co-authored-by: Optimox <sebastien.fischman@gmail.com>
Co-authored-by: Tanish Ambulkar <tanish.ambulkar99@gmail.com>
Co-authored-by: Botao Chen <markchen1015@meta.com>
Co-authored-by: solitude-alive <44771751+solitude-alive@users.noreply.github.com>
Co-authored-by: christobill <christobill@users.noreply.github.com>
Co-authored-by: Philip Bontrager <pbontrager@gmail.com>
Co-authored-by: Evan Smothers <ebs@fb.com>
Showing 9 changed files with 326 additions and 66 deletions.
@@ -0,0 +1,90 @@
# Config for multi-device QLoRA in lora_finetune_fsdp2.py
# using a Llama2 70B model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Llama-2-70b-hf --output-dir /tmp/Llama-2-70b-hf --hf-token <HF_TOKEN>
#
# This config needs 8 GPUs to run:
#   tune run --nproc_per_node 8 lora_finetune_fsdp2 --config llama2/70B_qlora
#

# Model Arguments
model:
  _component_: torchtune.models.llama2.qlora_llama2_70b
  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 16
  lora_alpha: 32

tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/Llama-2-70b-hf/tokenizer.model

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-70b-hf
  checkpoint_files: [
    pytorch_model-00001-of-00015.bin,
    pytorch_model-00002-of-00015.bin,
    pytorch_model-00003-of-00015.bin,
    pytorch_model-00004-of-00015.bin,
    pytorch_model-00005-of-00015.bin,
    pytorch_model-00006-of-00015.bin,
    pytorch_model-00007-of-00015.bin,
    pytorch_model-00008-of-00015.bin,
    pytorch_model-00009-of-00015.bin,
    pytorch_model-00010-of-00015.bin,
    pytorch_model-00011-of-00015.bin,
    pytorch_model-00012-of-00015.bin,
    pytorch_model-00013-of-00015.bin,
    pytorch_model-00014-of-00015.bin,
    pytorch_model-00015-of-00015.bin,
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Llama-2-70b-hf
  model_type: LLAMA2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  train_on_input: True
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

fsdp:
  cpu_offload: False

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False

# Logging
output_dir: /tmp/qlora_finetune_output
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True
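
For readers unfamiliar with these YAML files: torchtune's config system resolves each _component_ entry to a dotted Python path and calls it with the sibling keys as keyword arguments. Below is a minimal sketch of that flow, assuming the file above is saved locally as 70B_qlora.yaml (the filename is illustrative, not part of this commit):

# Sketch: how a torchtune recipe turns _component_ entries into objects.
# Assumes torchtune and omegaconf are installed, 70B_qlora.yaml is a local
# copy of the config above, and the weights have already been downloaded.
from omegaconf import OmegaConf
from torchtune import config

cfg = OmegaConf.load("70B_qlora.yaml")

# instantiate() imports the dotted path in _component_
# (here torchtune.models.llama2.llama2_tokenizer) and calls it
# with the remaining keys (here path=...) as keyword arguments.
tokenizer = config.instantiate(cfg.tokenizer)

# The same pattern builds the model, dataset, and checkpointer;
# materializing the 70B QLoRA model needs substantial GPU memory.
model = config.instantiate(cfg.model)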
@@ -0,0 +1,84 @@
# Config for single device QLoRA with lora_finetune_fsdp2.py
# using a Llama2 7B model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token <HF_TOKEN>
#
# To launch on a single device, run the following command from root:
#   tune run lora_finetune_fsdp2 --config llama2/7B_qlora
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
#   tune run lora_finetune_fsdp2 --config llama2/7B_qlora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Model Arguments
model:
  _component_: torchtune.models.llama2.qlora_llama2_7b
  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/Llama-2-7b-hf/tokenizer.model

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-7b-hf
  checkpoint_files: [
    pytorch_model-00001-of-00002.bin,
    pytorch_model-00002-of-00002.bin
  ]
  adapter_checkpoint: null
  recipe_checkpoint: null
  output_dir: /tmp/Llama-2-7b-hf
  model_type: LLAMA2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  train_on_input: True
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

fsdp:
  cpu_offload: False

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 16
compile: False

# Logging
output_dir: /tmp/qlora_finetune_output/
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True
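
A note on the numbers in these two files: LoRA scales its low-rank update by lora_alpha / lora_rank, so both configs land on the same effective scale of 2.0 even though the 70B file doubles the rank, and the samples per optimizer step differ because the 7B run trades devices for gradient accumulation. A quick arithmetic sketch (device counts taken from the launch commands in the headers above):

# Sketch: effective LoRA scale and samples per optimizer step for both configs.

def lora_scale(alpha: float, rank: int) -> float:
    # Standard LoRA scaling factor applied to the low-rank update.
    return alpha / rank

def samples_per_step(batch_size: int, grad_accum: int, num_devices: int) -> int:
    # Samples contributing to each optimizer step across all ranks.
    return batch_size * grad_accum * num_devices

print(lora_scale(32, 16))          # 70B config -> 2.0
print(lora_scale(16, 8))           # 7B config  -> 2.0
print(samples_per_step(2, 1, 8))   # 70B: batch 2 on 8 GPUs -> 16
print(samples_per_step(2, 16, 1))  # 7B: batch 2 x 16 accumulation steps -> 32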
(Diffs for the remaining changed files did not load.)