expose and set lora_dropout = 0.0 (#1492)
Co-authored-by: Felipe Mello <felipemello@fb.com>
felipemello1 and Felipe Mello authored Sep 11, 2024
1 parent 221031a commit d7fae96
Showing 48 changed files with 90 additions and 28 deletions.
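Every LoRA and QLoRA recipe config below gains the same one-line "lora_dropout: 0.0" entry under its model section, and the model builders shown further down change their lora_dropout default from 0.05 to 0.0 (the Gemma builders additionally expose the argument instead of hard-coding it). As a rough sketch of how the exposed field reaches a model through the usual torchtune config flow (the component and lora_attn_modules values here are illustrative, not taken from this commit):

    from omegaconf import OmegaConf
    from torchtune import config

    # Hypothetical model section mirroring the entry added across the configs in this commit.
    model_cfg = OmegaConf.create({
        "_component_": "torchtune.models.llama2.lora_llama2_7b",
        "lora_attn_modules": ["q_proj", "v_proj"],
        "apply_lora_to_mlp": False,
        "apply_lora_to_output": False,
        "lora_rank": 8,
        "lora_alpha": 16,
        "lora_dropout": 0.0,  # the newly exposed field; 0.0 disables dropout on the LoRA layers
    })
    model = config.instantiate(model_cfg)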
1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_lora.yaml
@@ -34,6 +34,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -33,6 +33,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -33,6 +33,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_lora.yaml
@@ -34,6 +34,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_lora_single_device.yaml
@@ -33,6 +33,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -33,6 +33,7 @@ model:
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/llama2/13B_lora.yaml
@@ -27,6 +27,7 @@ model:
apply_lora_to_output: True
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/70B_lora.yaml
@@ -17,6 +17,7 @@ model:
apply_lora_to_output: False
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/70B_qlora.yaml
@@ -22,6 +22,7 @@ model:
apply_lora_to_output: False
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_lora.yaml
@@ -26,6 +26,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_lora_dpo.yaml
@@ -25,6 +25,7 @@ model:
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -24,6 +24,7 @@ model:
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_lora_single_device.yaml
@@ -24,6 +24,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_qlora.yaml
@@ -25,6 +25,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama3/70B_lora.yaml
@@ -17,6 +17,7 @@ model:
apply_lora_to_output: False
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_lora.yaml
@@ -31,6 +31,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelMetaCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_lora_single_device.yaml
@@ -24,6 +24,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/llama3_1/70B_lora.yaml
@@ -16,6 +16,7 @@ model:
apply_lora_to_output: False
lora_rank: 16
lora_alpha: 32
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_lora.yaml
@@ -31,6 +31,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -24,6 +24,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/mistral/7B_lora.yaml
@@ -42,6 +42,7 @@ model:
apply_lora_to_output: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/mistral/7B_lora_single_device.yaml
@@ -39,6 +39,7 @@ model:
apply_lora_to_output: True
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/mistral/7B_qlora_single_device.yaml
@@ -40,6 +40,7 @@ model:
apply_lora_to_output: False
lora_rank: 64
lora_alpha: 16
lora_dropout: 0.0

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer

1 change: 1 addition & 0 deletions recipes/configs/phi3/mini_lora.yaml
@@ -25,6 +25,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/phi3/mini_lora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/phi3/mini_qlora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

# Tokenizer
tokenizer:

1 change: 1 addition & 0 deletions recipes/configs/qwen2/0.5B_lora.yaml
@@ -25,6 +25,7 @@ model:
apply_lora_to_mlp: False
lora_rank: 32
lora_alpha: 64
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/qwen2/0.5B_lora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_mlp: False
lora_rank: 32
lora_alpha: 64
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/qwen2/1.5B_lora.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_mlp: False
lora_rank: 32
lora_alpha: 64
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/qwen2/1.5B_lora_single_device.yaml
@@ -23,6 +23,7 @@ model:
apply_lora_to_mlp: False
lora_rank: 32
lora_alpha: 64
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/qwen2/7B_lora.yaml
@@ -26,6 +26,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

1 change: 1 addition & 0 deletions recipes/configs/qwen2/7B_lora_single_device.yaml
@@ -24,6 +24,7 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_dropout: 0.0

tokenizer:
_component_: torchtune.models.qwen2.qwen2_tokenizer

9 changes: 5 additions & 4 deletions torchtune/models/code_llama2/_model_builders.py
@@ -41,7 +41,7 @@ def lora_code_llama2_7b(
apply_lora_to_output: bool = False,
lora_rank: int = 8,
lora_alpha: float = 16,
lora_dropout: float = 0.05,
lora_dropout: float = 0.0,
use_dora: bool = False,
quantize_base: bool = False,
) -> TransformerDecoder:
@@ -62,7 +62,7 @@ def lora_code_llama2_7b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05
lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.0
quantize_base (bool): Whether to quantize base model weights
Returns:
@@ -125,7 +125,7 @@ def lora_code_llama2_13b(
apply_lora_to_output: bool = False,
lora_rank: int = 8,
lora_alpha: float = 16,
lora_dropout: float = 0.05,
lora_dropout: float = 0.0,
use_dora: bool = False,
quantize_base: bool = False,
) -> TransformerDecoder:
@@ -212,7 +212,7 @@ def lora_code_llama2_70b(
apply_lora_to_output: bool = False,
lora_rank: int = 8,
lora_alpha: float = 16,
lora_dropout: float = 0.05,
lora_dropout: float = 0.0,
use_dora: bool = False,
quantize_base: bool = False,
) -> TransformerDecoder:
@@ -233,6 +233,7 @@ def lora_code_llama2_70b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): LoRA dropout probability. Default: 0.0
use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
quantize_base (bool): Whether to quantize base model weights

8 changes: 6 additions & 2 deletions torchtune/models/gemma/_model_builders.py
@@ -68,6 +68,7 @@ def lora_gemma_2b(
apply_lora_to_mlp: bool = False,
lora_rank: int = 8,
lora_alpha: float = 16,
lora_dropout: float = 0.0,
use_dora: bool = False,
quantize_base: bool = False,
) -> GemmaTransformerDecoder:
@@ -86,6 +87,7 @@ def lora_gemma_2b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0
use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
quantize_base (bool): Whether to quantize base model weights
@@ -108,7 +110,7 @@ def lora_gemma_2b(
norm_eps=1e-6,
lora_rank=lora_rank,
lora_alpha=lora_alpha,
lora_dropout=0.05,
lora_dropout=lora_dropout,
use_dora=use_dora,
quantize_base=quantize_base,
)
@@ -150,6 +152,7 @@ def lora_gemma_7b(
apply_lora_to_mlp: bool = False,
lora_rank: int = 8,
lora_alpha: float = 16,
lora_dropout: float = 0.0,
use_dora: bool = False,
quantize_base: bool = False,
) -> GemmaTransformerDecoder:
@@ -168,6 +171,7 @@ def lora_gemma_7b(
Default: False
lora_rank (int): rank of each low-rank approximation
lora_alpha (float): scaling factor for the low-rank approximation
lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0
use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
quantize_base (bool): Whether to quantize base model weights
@@ -190,7 +194,7 @@ def lora_gemma_7b(
norm_eps=1e-6,
lora_rank=lora_rank,
lora_alpha=lora_alpha,
lora_dropout=0.05,
lora_dropout=lora_dropout,
use_dora=use_dora,
quantize_base=quantize_base,
)
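
For completeness, the exposed argument can also be passed directly to a builder in Python; a minimal sketch based on the lora_gemma_2b signature shown above (the choice of lora_attn_modules is illustrative):

    from torchtune.models.gemma import lora_gemma_2b

    # Dropout on the LoRA layers is now off by default; pass a nonzero
    # lora_dropout to re-enable it for extra regularization.
    model = lora_gemma_2b(
        lora_attn_modules=["q_proj", "v_proj"],
        apply_lora_to_mlp=False,
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.0,
    )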
(Diffs for the remaining changed files are not shown.)
