diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml
index ba5d43113d..1281bc3651 100644
--- a/recipes/configs/code_llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
index c2e990bf7b..1d4e91562c 100644
--- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/dev/8B_full_experimental.yaml b/recipes/configs/dev/8B_full_experimental.yaml
index 498e99e0a1..4ed8a80e09 100644
--- a/recipes/configs/dev/8B_full_experimental.yaml
+++ b/recipes/configs/dev/8B_full_experimental.yaml
@@ -52,8 +52,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml
index 2875bf5cce..7ee9e830e9 100644
--- a/recipes/configs/gemma/2B_full.yaml
+++ b/recipes/configs/gemma/2B_full.yaml
@@ -48,6 +48,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml
index 8e67fe2168..4df65e4430 100644
--- a/recipes/configs/gemma/2B_lora.yaml
+++ b/recipes/configs/gemma/2B_lora.yaml
@@ -51,6 +51,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml
index 8c322495ce..6d74bafc5b 100644
--- a/recipes/configs/gemma/2B_lora_single_device.yaml
+++ b/recipes/configs/gemma/2B_lora_single_device.yaml
@@ -50,6 +50,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml
index 7ed60ce180..c2a1992c8d 100644
--- a/recipes/configs/gemma/2B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -50,6 +50,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_full.yaml b/recipes/configs/gemma/7B_full.yaml
index dc4bbe26ae..41bc1513f0 100644
--- a/recipes/configs/gemma/7B_full.yaml
+++ b/recipes/configs/gemma/7B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 1
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml
index 5d0bcdb08f..295db1b951 100644
--- a/recipes/configs/gemma/7B_lora.yaml
+++ b/recipes/configs/gemma/7B_lora.yaml
@@ -53,6 +53,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml
index aa69fa50f8..71d002c69c 100644
--- a/recipes/configs/gemma/7B_lora_single_device.yaml
+++ b/recipes/configs/gemma/7B_lora_single_device.yaml
@@ -52,6 +52,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml b/recipes/configs/gemma/7B_qlora_single_device.yaml
index 8a08c49b5c..72321e0ab1 100644
--- a/recipes/configs/gemma/7B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -52,6 +52,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml
index 0b740b17cb..f5ecffc2ab 100644
--- a/recipes/configs/llama2/13B_full.yaml
+++ b/recipes/configs/llama2/13B_full.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 33d160ece0..267725ab92 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml
index 74c51fd6bc..176cbe23b9 100644
--- a/recipes/configs/llama2/13B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml
index 2c745402f1..ff4f56493b 100644
--- a/recipes/configs/llama2/70B_lora.yaml
+++ b/recipes/configs/llama2/70B_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml
index 7d2301106a..b8ff55c01b 100644
--- a/recipes/configs/llama2/70B_qlora.yaml
+++ b/recipes/configs/llama2/70B_qlora.yaml
@@ -66,6 +66,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 79d41cd8bc..2e80276c84 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -51,6 +51,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index dd527194b3..68e1d302df 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml
index 81a83577fe..3b6603d013 100644
--- a/recipes/configs/llama2/7B_lora_dpo.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo.yaml
@@ -55,6 +55,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.05
   lr: 5e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
index ef70976ee4..c01f28363b 100644
--- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.05
   lr: 5e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml
index dba911a0d7..6ff1740a8a 100644
--- a/recipes/configs/llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_single_device.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml
index 4a9cd4a63f..6fca6c4d4a 100644
--- a/recipes/configs/llama2/7B_qat_full.yaml
+++ b/recipes/configs/llama2/7B_qat_full.yaml
@@ -47,6 +47,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index 701eff6fa2..630d1f6357 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml
index 427cdc50be..34f0dcbebe 100644
--- a/recipes/configs/llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
index 031ae44d38..ab49fd3270 100644
--- a/recipes/configs/llama3/70B_full.yaml
+++ b/recipes/configs/llama3/70B_full.yaml
@@ -79,7 +79,6 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
   # Note: highly recommended to use fused=True optimizer flag
   # with CPU offload for faster optimizer step.
   fused: True
diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
index c323a277ed..00bd438fab 100644
--- a/recipes/configs/llama3/70B_lora.yaml
+++ b/recipes/configs/llama3/70B_lora.yaml
@@ -75,6 +75,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml
index 4ea5fca983..3911e856c2 100644
--- a/recipes/configs/llama3/8B_dora.yaml
+++ b/recipes/configs/llama3/8B_dora.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml
index cc149a3b61..7118733f53 100644
--- a/recipes/configs/llama3/8B_dora_single_device.yaml
+++ b/recipes/configs/llama3/8B_dora_single_device.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml
index 676ae8e72a..7f24376db7 100644
--- a/recipes/configs/llama3/8B_full.yaml
+++ b/recipes/configs/llama3/8B_full.yaml
@@ -52,8 +52,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml
index 26016cd5cd..5c3510f466 100644
--- a/recipes/configs/llama3/8B_lora.yaml
+++ b/recipes/configs/llama3/8B_lora.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml
index c35d93b9b4..8ae6853b62 100644
--- a/recipes/configs/llama3/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3/8B_lora_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml
index e990c757e9..ff4d9c3195 100644
--- a/recipes/configs/llama3/8B_qat_full.yaml
+++ b/recipes/configs/llama3/8B_qat_full.yaml
@@ -52,8 +52,7 @@ quantizer:
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml
index 5a66aa9edc..6e6b572c8e 100644
--- a/recipes/configs/llama3/8B_qdora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qdora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml
index b06523b4a0..cfc6ce93bc 100644
--- a/recipes/configs/llama3/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
index 30f5cd87b4..308093ef25 100644
--- a/recipes/configs/llama3_1/70B_full.yaml
+++ b/recipes/configs/llama3_1/70B_full.yaml
@@ -79,7 +79,6 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
   # Note: highly recommended to use fused=True optimizer flag
   # with CPU offload for faster optimizer step.
   fused: True
diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
index 995039db36..d7749569b8 100644
--- a/recipes/configs/llama3_1/70B_lora.yaml
+++ b/recipes/configs/llama3_1/70B_lora.yaml
@@ -74,6 +74,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml
index c59dbdd228..ad81a099ca 100644
--- a/recipes/configs/llama3_1/8B_full.yaml
+++ b/recipes/configs/llama3_1/8B_full.yaml
@@ -55,8 +55,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml
index 7f364a4462..e6b4a40dfc 100644
--- a/recipes/configs/llama3_1/8B_lora.yaml
+++ b/recipes/configs/llama3_1/8B_lora.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml
index a445051a98..dbd464c80f 100644
--- a/recipes/configs/llama3_1/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
index 6b8b3497c2..e9b5255ec8 100644
--- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -56,6 +56,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml
index b5d8e318a7..602b3fe082 100644
--- a/recipes/configs/mistral/7B_full.yaml
+++ b/recipes/configs/mistral/7B_full.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml
index a15b437d5a..fd2c637df7 100644
--- a/recipes/configs/mistral/7B_lora.yaml
+++ b/recipes/configs/mistral/7B_lora.yaml
@@ -59,6 +59,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml
index 1c461f9f46..9946cf7d54 100644
--- a/recipes/configs/mistral/7B_lora_single_device.yaml
+++ b/recipes/configs/mistral/7B_lora_single_device.yaml
@@ -56,6 +56,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml
index 54d0906150..61af23965d 100644
--- a/recipes/configs/mistral/7B_qlora_single_device.yaml
+++ b/recipes/configs/mistral/7B_qlora_single_device.yaml
@@ -57,6 +57,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_full.yaml b/recipes/configs/phi3/mini_full.yaml
index d92e05cb6d..0ee746ddd4 100644
--- a/recipes/configs/phi3/mini_full.yaml
+++ b/recipes/configs/phi3/mini_full.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml
index 0556b1de2c..721a61790b 100644
--- a/recipes/configs/phi3/mini_lora.yaml
+++ b/recipes/configs/phi3/mini_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml
index 15e2fc02a5..e722b8f4e3 100644
--- a/recipes/configs/phi3/mini_lora_single_device.yaml
+++ b/recipes/configs/phi3/mini_lora_single_device.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml
index 5b3ae9e6a1..3c7b1675b8 100644
--- a/recipes/configs/phi3/mini_qlora_single_device.yaml
+++ b/recipes/configs/phi3/mini_qlora_single_device.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml
index 05e08eca3c..5bf14591f9 100644
--- a/recipes/configs/qwen2/0.5B_full.yaml
+++ b/recipes/configs/qwen2/0.5B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/0.5B_full_single_device.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml
index 6f3db3fc47..67091a4e8a 100644
--- a/recipes/configs/qwen2/0.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml
@@ -48,6 +48,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 loss:
diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml
index dc0f0ebe5f..9ccd400897 100644
--- a/recipes/configs/qwen2/0.5B_lora.yaml
+++ b/recipes/configs/qwen2/0.5B_lora.yaml
@@ -55,6 +55,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-3
 
diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
index 23bcb1742c..f2cf00ebf3 100644
--- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-3
 
diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml
index 0c1aa9fa6a..cb7b5e2318 100644
--- a/recipes/configs/qwen2/1.5B_full.yaml
+++ b/recipes/configs/qwen2/1.5B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml
index 35516eefff..84fd73696b 100644
--- a/recipes/configs/qwen2/1.5B_lora.yaml
+++ b/recipes/configs/qwen2/1.5B_lora.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
index 6c77f84d16..fb1487bf1d 100644
--- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-3
 
 lr_scheduler:
diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml
index 2974583fb5..7ffc07e457 100644
--- a/recipes/configs/qwen2/7B_full.yaml
+++ b/recipes/configs/qwen2/7B_full.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml
index fa6737e8ed..f6a4cc2ac6 100644
--- a/recipes/configs/qwen2/7B_lora.yaml
+++ b/recipes/configs/qwen2/7B_lora.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml
index 188066e1e2..550878fa7a 100644
--- a/recipes/configs/qwen2/7B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/7B_lora_single_device.yaml
@@ -56,6 +56,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler: