Commit 3df9045

Update QAT: add grad clipping, torch.compile, collate fn
**Summary:** Update the qat_distributed recipe to match the full_finetune_distributed recipe. This commit adds features to QAT such as gradient clipping, torch.compile, and a user-configurable collate function for data pre-processing.

**Test Plan:** TBD
1 parent: f560cbb · commit: 3df9045
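The three additions are standard PyTorch mechanisms. Below is a minimal sketch of how a training recipe can wire them together; the parameter names (e.g. clip_grad_norm) and batch shapes are assumptions based on the full_finetune_distributed recipe this commit mirrors, not the recipe's actual code.

```python
# Hedged sketch of the three features added by this commit; names and
# shapes are illustrative, not torchtune's actual qat_distributed code.
import torch
from torch.utils.data import DataLoader


def train_epoch(model, dataset, collate_fn, loss_fn, optimizer,
                compile_model=False, clip_grad_norm=None):
    if compile_model:
        # torch.compile: longer first-step warmup, faster steady-state steps.
        model = torch.compile(model)

    # User-configurable collate function for data pre-processing.
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

    for batch in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(batch["tokens"]), batch["labels"])
        loss.backward()
        if clip_grad_norm is not None:
            # Gradient clipping: cap the global gradient norm before stepping.
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), max_norm=float(clip_grad_norm)
            )
        optimizer.step()
```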

File tree

3 files changed: +236 -77 lines

recipes/configs/llama2/7B_qat_full.yaml

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ device: cuda
 
 # Memory management
 enable_activation_checkpointing: True
-memory_efficient_fsdp_wrap: False
+enable_activation_offloading: False # True reduces memory
 
 # Reduced precision
 dtype: bf16
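The dropped memory_efficient_fsdp_wrap flag is superseded here by activation offloading. Conceptually the feature resembles plain PyTorch's save_on_cpu saved-tensor hook, shown below as a hedged illustration; torchtune's actual implementation differs in detail.

```python
# Illustration of the activation-offloading trade-off using stock
# torch.autograd.graph.save_on_cpu; not torchtune's implementation.
import torch

model = torch.nn.Linear(1024, 1024).cuda()
x = torch.randn(8, 1024, device="cuda", requires_grad=True)

with torch.autograd.graph.save_on_cpu(pin_memory=True):
    # Tensors saved for backward are kept in pinned host memory,
    # trading GPU memory for host<->device copy time.
    y = model(x).relu().sum()

y.backward()  # saved activations are copied back to the GPU as needed
```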

recipes/configs/llama3/8B_qat_full.yaml

Lines changed: 4 additions & 3 deletions
@@ -44,7 +44,6 @@ resume_from_checkpoint: False
 # Fine-tuning arguments
 batch_size: 2
 epochs: 3
-compile: False
 
 # QAT arguments
 quantizer:
@@ -59,13 +58,15 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
 
 # Memory management
 enable_activation_checkpointing: True
-memory_efficient_fsdp_wrap: True
+enable_activation_offloading: False # True reduces memory
+custom_sharded_layers: ['tok_embeddings', 'output']
 
 # Reduced precision
 dtype: bf16
@@ -74,6 +75,6 @@ dtype: bf16
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
-output_dir: /tmp/alpaca-llama3-finetune
+output_dir: /tmp/full-llama3-finetune
 log_every_n_steps: 1
 log_peak_memory_stats: True
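custom_sharded_layers is new in this config: it names modules to shard as their own FSDP units in addition to the per-layer wrapping, which helps with very large embedding and output matrices. A hedged sketch of the idea using FSDP2's fully_shard follows; the wiring is illustrative, not torchtune's actual sharding code, and it assumes torch.distributed is already initialized.

```python
# Hedged sketch: give the modules listed in custom_sharded_layers
# (here tok_embeddings and output) their own FSDP2 shard group so their
# parameters are gathered and freed independently of the decoder layers.
# Import path is the PyTorch 2.4/2.5-era location of FSDP2.
from torch.distributed._composable.fsdp import fully_shard


def shard(model, custom_sharded_layers):
    for name, module in model.named_modules():
        if name in custom_sharded_layers:
            fully_shard(module)  # e.g. tok_embeddings, output
    for layer in model.layers:  # assumes a .layers ModuleList of decoder blocks
        fully_shard(layer)
    fully_shard(model)  # root wrap for everything left over
```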
