From 73ee4548c41467a8a67b10cf3ede8dc939b48f96 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 15:19:12 +0000 Subject: [PATCH 1/9] adding docs --- docs/source/api_ref_modules.rst | 1 + .../source/tutorials/memory_optimizations.rst | 112 +++++++++++++++++- torchtune/modules/peft/dora.py | 17 ++- 3 files changed, 115 insertions(+), 15 deletions(-) diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index cc9a493147..f360b4f02c 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -71,6 +71,7 @@ PEFT Components :nosignatures: peft.LoRALinear + peft.DoRALinear peft.AdapterModule peft.get_adapter_params peft.set_trainable_params diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index 04644093a9..e989b7590a 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -273,11 +273,12 @@ These are all specified under the ``model`` flag or config entry, i.e: tune run lora_finetune_single_device --config llama3/8B_lora_single_device \ model.apply_lora_to_mlp=True \ - model.lora_attn_modules=["q_proj","k_proj","v_proj"] + model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ .. code-block:: yaml model: + _component_: torchtune.models.llama3.lora_llama3_8b apply_lora_to_mlp: True model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"] @@ -292,7 +293,24 @@ Secondly, parameters which control the scale of the impact of LoRA on the model: to your specific use case. Typically, one jointly changes ``lora_rank`` and ``lora_alpha`` together, where ``lora_alpha ~= 2*lora_rank``. * ``lora_dropout`` introduces dropout in the LoRA layers to help regularize training. We default to 0.0 for all of our models. -As above, these parameters are also specified under the ``model`` flag or config entry. +As above, these parameters are also specified under the ``model`` flag or config entry: + +.. code-block:: bash + + tune run lora_finetune_single_device --config llama3/8B_lora_single_device \ + model.apply_lora_to_mlp=True \ + model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ + model.lora_rank=128 \ + model.lora_rank=256 + +.. code-block:: yaml + + model: + _component_: torchtune.models.llama3.lora_llama3_8b + apply_lora_to_mlp: True + model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"] + model.lora_rank: 128 + model.lora_rank: 256 .. note:: @@ -323,18 +341,100 @@ You can finetune using QLoRA with any of our LoRA recipes, i.e. recipes with the QLoRA-enabled model builders, which we support for all our models, and also use the ``qlora_`` prefix, e.g. the :func:`torchtune.models.llama3.llama3_8b` model has a corresponding :func:`torchtune.models.llama3.qlora_llama3_8b`. We aim to provide a comprehensive set of configurations to allow you to get started with training with QLoRA quickly, -just specify any config with ``_qlora`` in its name, e.g: +just specify any config with ``_qlora`` in its name. +All the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA ` +to see how to configure. + +To configure from the command line: .. 
code-block:: bash
 
-   tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+   tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \
+    model.apply_lora_to_mlp=True \
+    model.lora_attn_modules=["q_proj","k_proj","v_proj"] \
+    model.lora_rank=128 \
+    model.lora_rank=256 \
 
-All the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA <glossary_lora>`
-to see how to configure.
+
+or, by modifying a config:
+
+.. code-block:: yaml
+
+  model:
+    _component_: torchtune.models.qlora_llama3_8b
+    apply_lora_to_mlp: True
+    model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"]
+    model.lora_rank: 128
+    model.lora_rank: 256
+
+
+
+.. _glossary_dora:
+
+Weight-Decomposed Low-Rank Adaptation (DoRA)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+*What's going on here?*
+
+`DoRA <https://arxiv.org/abs/2402.09353>`_ is another PEFT technique which builds on top of LoRA by
+further decomposing the pre-trained weights into two components: magnitude and direction. The magnitude component
+is a learnable vector that adjusts the scale of each output channel, while the direction component corresponds to the original LoRA decomposition and
+updates the orientation of the weights.
+
+DoRA adds a small overhead to LoRA training due to the addition of the magnitude parameter, but it has been shown to
+improve the performance of LoRA, particularly at low ranks.
+
+*Sounds great! How do I use it?*
+
+Much like LoRA and QLoRA, you can finetune using DoRA with any of our LoRA recipes. We use the same model builders for LoRA
+as we do for DoRA, so you can use the ``lora_`` version of any model builder with ``use_dora=True``. For example, to finetune
+:func:`torchtune.models.llama3.llama3_8b` with DoRA, you would use :func:`torchtune.models.llama3.lora_llama3_8b` with ``use_dora=True``:
+
+.. code-block:: bash
+
+   tune run lora_finetune_single_device --config llama3/8B_lora_single_device \
+   model.use_dora=True
+
+.. code-block:: yaml
+
+  model:
+    _component_: torchtune.models.lora_llama3_8b
+    use_dora: True
+
+Since DoRA extends LoRA, the parameters for :ref:`customizing LoRA <glossary_lora>` are identical. You can also quantize the base model weights like in :ref:`glossary_qlora` by using ``quantize_base=True`` to reap
+even more memory savings!
+
+.. code-block:: bash
+
+   tune run lora_finetune_single_device --config llama3/8B_lora_single_device \
+   model.apply_lora_to_mlp=True \
+   model.lora_attn_modules=["q_proj","k_proj","v_proj"] \
+   model.lora_rank=128 \
+   model.lora_rank=256 \
+   model.use_dora=True \
+   model.quantize_base=True \
+
+.. code-block:: yaml
+
+  model:
+    _component_: torchtune.models.lora_llama3_8b
+    apply_lora_to_mlp: True
+    model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"]
+    model.lora_rank: 128
+    model.lora_rank: 256
+    use_dora: True
+    quantize_base: True
+
+
+.. note::
+
+    Under the hood, we've enabled DoRA by adding the :class:`~torchtune.modules.peft.DoRALinear` module, which we swap
+    out for :class:`~torchtune.modules.peft.LoRALinear` when ``use_dora=True``.
 
 .. _glossary_distrib:
 
+
 .. TODO
 
 .. Distributed
diff --git a/torchtune/modules/peft/dora.py b/torchtune/modules/peft/dora.py
index 153b3c78e1..52ad9c7321 100644
--- a/torchtune/modules/peft/dora.py
+++ b/torchtune/modules/peft/dora.py
@@ -18,15 +18,14 @@
 
 
 class DoRALinear(nn.Module, AdapterModule):
-    """LoRA linear layer as introduced in `LoRA: Low-Rank Adaptation of Large Language Models <https://arxiv.org/abs/2106.09685>`_.
-
-    LoRA perturbs a given layer via a low-rank approximation where only
-    the rank decomposition matrices are trainable. 
In a linear layer instead of - :math:`x \\mapsto W_0x` a LoRALinear layer is defined as - :math:`x \\mapsto W_0x + (\\alpha / r)BAx`, where :math:`r` is the rank of - the matrices :math:`A` and :math:`B` and :math:`\\alpha` is a scaling factor. - As in the original implementation, we support dropout before multiplication - by the low-rank matrices. + """DoRA linear layer as introduced in + `DoRA: Weight-Decomposed Low-Rank Adaptation of Large Language Models `_. + + DoRA (Weight-Decomposed Low-Rank Adaptation) fine-tunes a layer by decomposing the pre-trained weights + into two components: magnitude and direction. The magnitude component is a learnable scalar vector + that scales each output channel, while the direction component, modified via LoRA, adjusts the orientation + of weights. By scaling the LoRA update component :math:`BAx` with the `magnitude` vector, DoRA allows the model + to apply distinct scaling adjustments across different output dimensions. Args: in_dim (int): input dimension From 1c3cc39d519a27cd41da14d7d55ea3d9c3a3cb02 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 15:22:20 +0000 Subject: [PATCH 2/9] whoops --- docs/source/recipes/lora_finetune_single_device.rst | 1 + docs/source/tutorials/memory_optimizations.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/recipes/lora_finetune_single_device.rst b/docs/source/recipes/lora_finetune_single_device.rst index 83d7a385c0..4b4d476058 100644 --- a/docs/source/recipes/lora_finetune_single_device.rst +++ b/docs/source/recipes/lora_finetune_single_device.rst @@ -44,6 +44,7 @@ see our documentation for the different PEFT training paradigms we support: * :ref:`glossary_lora` * :ref:`glossary_qlora` +* :ref:`glossary_dora` Many of our other memory optimization features can be used in this recipe. You can learn more about all of our memory optimization features in our :ref:`memory optimization overview`. diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index e989b7590a..34ddb0db8d 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -273,7 +273,7 @@ These are all specified under the ``model`` flag or config entry, i.e: tune run lora_finetune_single_device --config llama3/8B_lora_single_device \ model.apply_lora_to_mlp=True \ - model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ + model.lora_attn_modules=["q_proj","k_proj","v_proj"] .. code-block:: yaml From 9bc635024d11bc46672ee0ebb6aae628b0bcf186 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 15:23:11 +0000 Subject: [PATCH 3/9] whoops2 --- docs/source/tutorials/memory_optimizations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index 34ddb0db8d..f453f533b6 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -344,7 +344,7 @@ We aim to provide a comprehensive set of configurations to allow you to get star just specify any config with ``_qlora`` in its name. All the rest of the LoRA parameters remain the same for QLoRA - check out the section above on :ref:`LoRA ` -to see how to configure. +to see how to configure these parameters. 
To configure from the command line: From 847bee36f4e6bf6e146e4e57781c7c11357d77b2 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 15:29:28 +0000 Subject: [PATCH 4/9] fixing one more thing --- docs/source/tutorials/memory_optimizations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index f453f533b6..7357ed78a4 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -413,7 +413,7 @@ even more memory savings! model.lora_rank=128 \ model.lora_rank=256 \ model.use_dora=True \ - model.quantize_base=True \ + model.quantize_base=True .. code-block:: yaml From 318500fa52554775f5a0b804736b207a2a384ef5 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 15:36:18 +0000 Subject: [PATCH 5/9] missed one more thing --- docs/source/tutorials/memory_optimizations.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index 7357ed78a4..f9ad6e0c8d 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -21,6 +21,7 @@ To make things easy, we've summarized these components in the following table: ":ref:`glossary_opt_in_bwd`", "Helps reduce memory usage when using stateful optimizers, particularly when full-finetuning large models with high gradient memory usage. This is not compatible with ``gradient_accumulation_steps``, so training may slow down due to reduced model throughput." ":ref:`glossary_lora`", "When you want to significantly reduce the number of trainable parameters, saving gradient and optimizer memory during training, and significantly speeding up training." ":ref:`glossary_qlora`", "When you need even more memory savings than LoRA, at the potential cost of some training speed. Useful for very large models or limited hardware." + ":ref:`glossary_dora`", "Like LoRA, DoRA can provide significant memory savings and training speed-ups. DoRA may improve performance over LoRA, particularly when using small rank updates." .. note:: From b208c94bcc02b45be284f7f4d5e77d3582add638 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 17:09:14 +0000 Subject: [PATCH 6/9] whoops...aroonie? --- .../source/tutorials/memory_optimizations.rst | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index f9ad6e0c8d..495fba181c 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -309,9 +309,9 @@ As above, these parameters are also specified under the ``model`` flag or config model: _component_: torchtune.models.llama3.lora_llama3_8b apply_lora_to_mlp: True - model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"] - model.lora_rank: 128 - model.lora_rank: 256 + lora_attn_modules: ["q_proj", "k_proj", "v_proj"] + lora_rank: 32 + lora_rank: 64 .. 
note:: @@ -354,8 +354,8 @@ To configure from the command line: tune run lora_finetune_single_device --config llama3/8B_qlora_single_device \ model.apply_lora_to_mlp=True \ model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ - model.lora_rank=128 \ - model.lora_rank=256 \ + model.lora_rank=32 \ + model.lora_alpha=64 or, by modifying a config: @@ -365,9 +365,9 @@ or, by modifying a config: model: _component_: torchtune.models.qlora_llama3_8b apply_lora_to_mlp: True - model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"] - model.lora_rank: 128 - model.lora_rank: 256 + lora_attn_modules: ["q_proj", "k_proj", "v_proj"] + lora_rank: 32 + lora_alpha: 64 @@ -411,8 +411,8 @@ even more memory savings! tune run lora_finetune_single_device --config llama3/8B_lora_single_device \ model.apply_lora_to_mlp=True \ model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ - model.lora_rank=128 \ - model.lora_rank=256 \ + model.lora_rank=16 \ + model.lora_alpha=32 \ model.use_dora=True \ model.quantize_base=True @@ -421,9 +421,9 @@ even more memory savings! model: _component_: torchtune.models.lora_llama3_8b apply_lora_to_mlp: True - model.lora_attn_modules: ["q_proj", "k_proj", "v_proj"] - model.lora_rank: 128 - model.lora_rank: 256 + lora_attn_modules: ["q_proj", "k_proj", "v_proj"] + lora_rank: 16 + lora_rank: 32 use_dora: True quantize_base: True From 0448ed7a3930c94587c3fa15b1821c594409c0a9 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 17:23:44 +0000 Subject: [PATCH 7/9] OnElast thing --- docs/source/tutorials/memory_optimizations.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index 495fba181c..c9f0f8dd16 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -301,8 +301,8 @@ As above, these parameters are also specified under the ``model`` flag or config tune run lora_finetune_single_device --config llama3/8B_lora_single_device \ model.apply_lora_to_mlp=True \ model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ - model.lora_rank=128 \ - model.lora_rank=256 + model.lora_rank=32 \ + model.lora_rank=64 .. code-block:: yaml @@ -311,7 +311,7 @@ As above, these parameters are also specified under the ``model`` flag or config apply_lora_to_mlp: True lora_attn_modules: ["q_proj", "k_proj", "v_proj"] lora_rank: 32 - lora_rank: 64 + lora_alpha: 64 .. note:: @@ -369,8 +369,6 @@ or, by modifying a config: lora_rank: 32 lora_alpha: 64 - - .. _glossary_dora: Weight-Decomposed Low-Rank Adaptation (DoRA) From 9c4f0bb83089bd22b975c79160a9401d8a4c1d0b Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 17:49:40 +0000 Subject: [PATCH 8/9] removing nightly ref --- docs/source/tutorials/memory_optimizations.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index c9f0f8dd16..cf7554205f 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -109,7 +109,7 @@ checkpointing, where all activations will either be recomputed later in the back To enable activation offloading, use the ``enable_activation_offloading`` config entry or flag in our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow -usage of streams, make sure you are on a torch version later than PyTorch 2.5.0.dev20240907. 
+usage of streams, make sure you are on a torch version later than PyTorch 2.5.0. .. _glossary_grad_accm: From 0d76a65990123fa86797f819657e315e1bfb0e72 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 29 Oct 2024 23:07:08 +0000 Subject: [PATCH 9/9] cmon chief --- docs/source/tutorials/memory_optimizations.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index cf7554205f..cc71df53ae 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -302,7 +302,7 @@ As above, these parameters are also specified under the ``model`` flag or config model.apply_lora_to_mlp=True \ model.lora_attn_modules=["q_proj","k_proj","v_proj"] \ model.lora_rank=32 \ - model.lora_rank=64 + model.lora_alpha=64 .. code-block:: yaml @@ -421,7 +421,7 @@ even more memory savings! apply_lora_to_mlp: True lora_attn_modules: ["q_proj", "k_proj", "v_proj"] lora_rank: 16 - lora_rank: 32 + lora_alpha: 32 use_dora: True quantize_base: True
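
For intuition about the magnitude/direction decomposition described in the DoRA docs and in the new ``DoRALinear`` docstring above, here is a minimal, self-contained sketch of a DoRA-style linear layer. This is an illustration only and not torchtune's :class:`~torchtune.modules.peft.DoRALinear`: the class name ``DoRASketchLinear``, its initialization scheme, and the way the adapted weight is materialized are assumptions made for this example, and features such as dropout, quantized base weights, and adapter state-dict handling are omitted.

.. code-block:: python

    # Illustrative sketch only -- not the torchtune ``DoRALinear`` module.
    import math

    import torch
    import torch.nn as nn
    import torch.nn.functional as F


    class DoRASketchLinear(nn.Module):
        """Frozen base weight + trainable low-rank factors + trainable magnitude."""

        def __init__(self, in_dim: int, out_dim: int, rank: int, alpha: float):
            super().__init__()
            self.scaling = alpha / rank
            # Frozen pre-trained weight: the "direction" component before normalization.
            self.weight = nn.Parameter(torch.randn(out_dim, in_dim), requires_grad=False)
            # LoRA factors: lora_b @ lora_a has the same shape as the base weight.
            self.lora_a = nn.Parameter(torch.randn(rank, in_dim) / math.sqrt(in_dim))
            self.lora_b = nn.Parameter(torch.zeros(out_dim, rank))
            # Magnitude: one learnable scale per output channel, initialized to the
            # per-row norm of the base weight so the layer starts off unchanged.
            self.magnitude = nn.Parameter(self.weight.detach().norm(p=2, dim=1).clone())

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # Direction: base weight perturbed by the scaled low-rank update (alpha / r) * B @ A.
            directed = self.weight + self.scaling * (self.lora_b @ self.lora_a)
            # Normalize each output row, then rescale it by the learned magnitude.
            row_norm = directed.norm(p=2, dim=1, keepdim=True)
            adapted = self.magnitude.unsqueeze(-1) * directed / row_norm
            return F.linear(x, adapted)


    layer = DoRASketchLinear(in_dim=16, out_dim=8, rank=4, alpha=8.0)
    print(layer(torch.randn(2, 16)).shape)  # torch.Size([2, 8])

Because ``lora_b`` starts at zero and ``magnitude`` starts at the row-wise norm of the frozen weight, the layer initially reproduces the base projection exactly; training then updates only ``lora_a``, ``lora_b``, and ``magnitude``, which is where the small overhead over plain LoRA mentioned in the docs comes from.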