docs/source/tutorials/qat_finetune.rst
5 lines changed: 0 additions & 5 deletions
@@ -168,11 +168,6 @@ modifications accordingly:
   fake_quant_after_n_steps: 1000
   memory_efficient_fsdp_wrap: False
 
-.. note::
-
-  QAT in torchtune is currently not compatible with `memory_efficient_fsdp_wrap <https://pytorch.org/torchtune/stable/generated/torchtune.utils.get_full_finetune_fsdp_wrap_policy.html#torchtune.utils.get_full_finetune_fsdp_wrap_policy>`_.
-  This is a known issue and will be fixed in a future torchtune version.
-
 Empirically, we observed that disabling fake quantization for the first N steps
 led to better results, presumably because doing so allows the weights to stabilize
 before we start introducing quantization noise to the fine-tuning process.
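
To make the delayed fake-quantization idea behind fake_quant_after_n_steps concrete, here is a minimal, self-contained sketch. It is not torchtune's recipe code: the FakeQuantLinear module and the set_fake_quant helper are hypothetical stand-ins for the real fake-quant toggles, and only the control flow (keep fake quantization off for the first N steps, then enable it) mirrors what the tutorial describes.

import torch
import torch.nn as nn
import torch.nn.functional as F


class FakeQuantLinear(nn.Linear):
    """Linear layer whose weights can be fake-quantized to int8 on the fly."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.fake_quant_enabled = False

    def forward(self, x):
        w = self.weight
        if self.fake_quant_enabled:
            # Symmetric per-tensor int8 fake quantization: quantize, then
            # dequantize, so the forward pass sees the rounding error.
            scale = w.detach().abs().max().clamp(min=1e-8) / 127.0
            w_q = torch.clamp(torch.round(w / scale), -127, 127) * scale
            # Straight-through estimator: gradients flow as if w were unchanged.
            w = w + (w_q - w).detach()
        return F.linear(x, w, self.bias)


def set_fake_quant(model: nn.Module, enabled: bool) -> None:
    for module in model.modules():
        if isinstance(module, FakeQuantLinear):
            module.fake_quant_enabled = enabled


# Keep fake quantization off for the first N steps so the weights can
# stabilize, then turn it on for the remainder of fine-tuning.
fake_quant_after_n_steps = 1000
model = nn.Sequential(FakeQuantLinear(16, 16), nn.ReLU(), FakeQuantLinear(16, 4))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

for step in range(2000):
    if step == fake_quant_after_n_steps:
        set_fake_quant(model, True)
    x, y = torch.randn(8, 16), torch.randint(0, 4, (8,))
    loss = F.cross_entropy(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()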