diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml
index ba5d43113d..1281bc3651 100644
--- a/recipes/configs/code_llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
index c2e990bf7b..1d4e91562c 100644
--- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/dev/8B_full_experimental.yaml b/recipes/configs/dev/8B_full_experimental.yaml
index 498e99e0a1..4ed8a80e09 100644
--- a/recipes/configs/dev/8B_full_experimental.yaml
+++ b/recipes/configs/dev/8B_full_experimental.yaml
@@ -52,8 +52,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml
index 2875bf5cce..7ee9e830e9 100644
--- a/recipes/configs/gemma/2B_full.yaml
+++ b/recipes/configs/gemma/2B_full.yaml
@@ -48,6 +48,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml
index 8e67fe2168..4df65e4430 100644
--- a/recipes/configs/gemma/2B_lora.yaml
+++ b/recipes/configs/gemma/2B_lora.yaml
@@ -51,6 +51,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml
index 8c322495ce..6d74bafc5b 100644
--- a/recipes/configs/gemma/2B_lora_single_device.yaml
+++ b/recipes/configs/gemma/2B_lora_single_device.yaml
@@ -50,6 +50,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml
index 7ed60ce180..c2a1992c8d 100644
--- a/recipes/configs/gemma/2B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -50,6 +50,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_full.yaml b/recipes/configs/gemma/7B_full.yaml
index dc4bbe26ae..41bc1513f0 100644
--- a/recipes/configs/gemma/7B_full.yaml
+++ b/recipes/configs/gemma/7B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 1
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml
index 5d0bcdb08f..295db1b951 100644
--- a/recipes/configs/gemma/7B_lora.yaml
+++ b/recipes/configs/gemma/7B_lora.yaml
@@ -53,6 +53,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml
index aa69fa50f8..71d002c69c 100644
--- a/recipes/configs/gemma/7B_lora_single_device.yaml
+++ b/recipes/configs/gemma/7B_lora_single_device.yaml
@@ -52,6 +52,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-5
 
 lr_scheduler:
diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml b/recipes/configs/gemma/7B_qlora_single_device.yaml
index 8a08c49b5c..72321e0ab1 100644
--- a/recipes/configs/gemma/7B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -52,6 +52,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml
index 0b740b17cb..f5ecffc2ab 100644
--- a/recipes/configs/llama2/13B_full.yaml
+++ b/recipes/configs/llama2/13B_full.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 33d160ece0..267725ab92 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml
index 74c51fd6bc..176cbe23b9 100644
--- a/recipes/configs/llama2/13B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml
index 2c745402f1..ff4f56493b 100644
--- a/recipes/configs/llama2/70B_lora.yaml
+++ b/recipes/configs/llama2/70B_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml
index 7d2301106a..b8ff55c01b 100644
--- a/recipes/configs/llama2/70B_qlora.yaml
+++ b/recipes/configs/llama2/70B_qlora.yaml
@@ -66,6 +66,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 79d41cd8bc..2e80276c84 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -51,6 +51,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index dd527194b3..68e1d302df 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml
index 81a83577fe..3b6603d013 100644
--- a/recipes/configs/llama2/7B_lora_dpo.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo.yaml
@@ -55,6 +55,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.05
   lr: 5e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
index ef70976ee4..c01f28363b 100644
--- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.05
   lr: 5e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml
index dba911a0d7..6ff1740a8a 100644
--- a/recipes/configs/llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_single_device.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml
index 4a9cd4a63f..6fca6c4d4a 100644
--- a/recipes/configs/llama2/7B_qat_full.yaml
+++ b/recipes/configs/llama2/7B_qat_full.yaml
@@ -47,6 +47,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index 701eff6fa2..630d1f6357 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml
index 427cdc50be..34f0dcbebe 100644
--- a/recipes/configs/llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
index 031ae44d38..ab49fd3270 100644
--- a/recipes/configs/llama3/70B_full.yaml
+++ b/recipes/configs/llama3/70B_full.yaml
@@ -79,7 +79,6 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
   # Note: highly recommended to use fused=True optimizer flag
   # with CPU offload for faster optimizer step.
   fused: True
diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
index c323a277ed..00bd438fab 100644
--- a/recipes/configs/llama3/70B_lora.yaml
+++ b/recipes/configs/llama3/70B_lora.yaml
@@ -75,6 +75,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml
index 4ea5fca983..3911e856c2 100644
--- a/recipes/configs/llama3/8B_dora.yaml
+++ b/recipes/configs/llama3/8B_dora.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml
index cc149a3b61..7118733f53 100644
--- a/recipes/configs/llama3/8B_dora_single_device.yaml
+++ b/recipes/configs/llama3/8B_dora_single_device.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml
index 676ae8e72a..7f24376db7 100644
--- a/recipes/configs/llama3/8B_full.yaml
+++ b/recipes/configs/llama3/8B_full.yaml
@@ -52,8 +52,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml
index 26016cd5cd..5c3510f466 100644
--- a/recipes/configs/llama3/8B_lora.yaml
+++ b/recipes/configs/llama3/8B_lora.yaml
@@ -55,6 +55,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml
index c35d93b9b4..8ae6853b62 100644
--- a/recipes/configs/llama3/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3/8B_lora_single_device.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml
index e990c757e9..ff4d9c3195 100644
--- a/recipes/configs/llama3/8B_qat_full.yaml
+++ b/recipes/configs/llama3/8B_qat_full.yaml
@@ -52,8 +52,7 @@ quantizer:
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml
index 5a66aa9edc..6e6b572c8e 100644
--- a/recipes/configs/llama3/8B_qdora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qdora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml
index b06523b4a0..cfc6ce93bc 100644
--- a/recipes/configs/llama3/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
index 30f5cd87b4..308093ef25 100644
--- a/recipes/configs/llama3_1/70B_full.yaml
+++ b/recipes/configs/llama3_1/70B_full.yaml
@@ -79,7 +79,6 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
   # Note: highly recommended to use fused=True optimizer flag
   # with CPU offload for faster optimizer step.
   fused: True
diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
index 995039db36..d7749569b8 100644
--- a/recipes/configs/llama3_1/70B_lora.yaml
+++ b/recipes/configs/llama3_1/70B_lora.yaml
@@ -74,6 +74,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml
index c59dbdd228..ad81a099ca 100644
--- a/recipes/configs/llama3_1/8B_full.yaml
+++ b/recipes/configs/llama3_1/8B_full.yaml
@@ -55,8 +55,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-  foreach: False
-
+  fused: True
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml
index 7f364a4462..e6b4a40dfc 100644
--- a/recipes/configs/llama3_1/8B_lora.yaml
+++ b/recipes/configs/llama3_1/8B_lora.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml
index a445051a98..dbd464c80f 100644
--- a/recipes/configs/llama3_1/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -57,6 +57,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
index 6b8b3497c2..e9b5255ec8 100644
--- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -56,6 +56,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml
index b5d8e318a7..602b3fe082 100644
--- a/recipes/configs/mistral/7B_full.yaml
+++ b/recipes/configs/mistral/7B_full.yaml
@@ -54,6 +54,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml
index a15b437d5a..fd2c637df7 100644
--- a/recipes/configs/mistral/7B_lora.yaml
+++ b/recipes/configs/mistral/7B_lora.yaml
@@ -59,6 +59,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml
index 1c461f9f46..9946cf7d54 100644
--- a/recipes/configs/mistral/7B_lora_single_device.yaml
+++ b/recipes/configs/mistral/7B_lora_single_device.yaml
@@ -56,6 +56,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml
index 54d0906150..61af23965d 100644
--- a/recipes/configs/mistral/7B_qlora_single_device.yaml
+++ b/recipes/configs/mistral/7B_qlora_single_device.yaml
@@ -57,6 +57,7 @@ save_adapter_weights_only: False
 
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_full.yaml b/recipes/configs/phi3/mini_full.yaml
index d92e05cb6d..0ee746ddd4 100644
--- a/recipes/configs/phi3/mini_full.yaml
+++ b/recipes/configs/phi3/mini_full.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml
index 0556b1de2c..721a61790b 100644
--- a/recipes/configs/phi3/mini_lora.yaml
+++ b/recipes/configs/phi3/mini_lora.yaml
@@ -60,6 +60,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml
index 15e2fc02a5..e722b8f4e3 100644
--- a/recipes/configs/phi3/mini_lora_single_device.yaml
+++ b/recipes/configs/phi3/mini_lora_single_device.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml
index 5b3ae9e6a1..3c7b1675b8 100644
--- a/recipes/configs/phi3/mini_qlora_single_device.yaml
+++ b/recipes/configs/phi3/mini_qlora_single_device.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 gradient_accumulation_steps: 16
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml
index 05e08eca3c..5bf14591f9 100644
--- a/recipes/configs/qwen2/0.5B_full.yaml
+++ b/recipes/configs/qwen2/0.5B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/0.5B_full_single_device.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml
index 6f3db3fc47..67091a4e8a 100644
--- a/recipes/configs/qwen2/0.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml
@@ -48,6 +48,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 loss:
diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml
index dc0f0ebe5f..9ccd400897 100644
--- a/recipes/configs/qwen2/0.5B_lora.yaml
+++ b/recipes/configs/qwen2/0.5B_lora.yaml
@@ -55,6 +55,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-3
 
diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
index 23bcb1742c..f2cf00ebf3 100644
--- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
@@ -53,6 +53,7 @@ batch_size: 4
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 2e-3
 
diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml
index 0c1aa9fa6a..cb7b5e2318 100644
--- a/recipes/configs/qwen2/1.5B_full.yaml
+++ b/recipes/configs/qwen2/1.5B_full.yaml
@@ -50,6 +50,7 @@ batch_size: 2
 epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml
index 35516eefff..84fd73696b 100644
--- a/recipes/configs/qwen2/1.5B_lora.yaml
+++ b/recipes/configs/qwen2/1.5B_lora.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-5
 
 lr_scheduler:
diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
index 6c77f84d16..fb1487bf1d 100644
--- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
@@ -52,6 +52,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 2e-3
 
 lr_scheduler:
diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml
index 2974583fb5..7ffc07e457 100644
--- a/recipes/configs/qwen2/7B_full.yaml
+++ b/recipes/configs/qwen2/7B_full.yaml
@@ -53,6 +53,7 @@ batch_size: 2
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml
index fa6737e8ed..f6a4cc2ac6 100644
--- a/recipes/configs/qwen2/7B_lora.yaml
+++ b/recipes/configs/qwen2/7B_lora.yaml
@@ -58,6 +58,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler:
diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml
index 188066e1e2..550878fa7a 100644
--- a/recipes/configs/qwen2/7B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/7B_lora_single_device.yaml
@@ -56,6 +56,7 @@ batch_size: 2
 # Optimizer and Scheduler
 optimizer:
   _component_: torch.optim.AdamW
+  fused: True
   weight_decay: 0.01
   lr: 3e-4
 lr_scheduler: