From 3978d15b4e158583bfe1537d6624f622d7cfc472 Mon Sep 17 00:00:00 2001 From: krammnic Date: Sun, 20 Oct 2024 14:23:38 -0400 Subject: [PATCH 01/10] add required arguments in configs --- recipes/configs/code_llama2/7B_full_low_memory.yaml | 4 +++- recipes/configs/code_llama2/7B_lora_single_device.yaml | 4 +++- recipes/configs/code_llama2/7B_qlora_single_device.yaml | 3 ++- recipes/configs/dev/8B_full_experimental.yaml | 4 +++- recipes/configs/gemma/2B_full.yaml | 4 +++- recipes/configs/gemma/2B_lora.yaml | 4 +++- recipes/configs/gemma/2B_lora_single_device.yaml | 3 ++- recipes/configs/gemma/2B_qlora_single_device.yaml | 3 ++- recipes/configs/gemma/7B_full.yaml | 4 +++- recipes/configs/gemma/7B_lora.yaml | 4 +++- recipes/configs/gemma/7B_lora_single_device.yaml | 3 ++- recipes/configs/gemma/7B_qlora_single_device.yaml | 3 ++- recipes/configs/llama2/13B_full.yaml | 4 +++- recipes/configs/llama2/13B_lora.yaml | 4 +++- recipes/configs/llama2/13B_qlora_single_device.yaml | 3 ++- recipes/configs/llama2/70B_lora.yaml | 3 ++- recipes/configs/llama2/70B_qlora.yaml | 3 ++- recipes/configs/llama2/7B_full.yaml | 5 +++-- recipes/configs/llama2/7B_full_low_memory.yaml | 3 ++- recipes/configs/llama2/7B_lora.yaml | 7 ++++--- recipes/configs/llama2/7B_lora_dpo.yaml | 4 +++- recipes/configs/llama2/7B_lora_dpo_single_device.yaml | 3 ++- recipes/configs/llama2/7B_lora_single_device.yaml | 3 ++- recipes/configs/llama2/7B_qat_full.yaml | 4 +++- recipes/configs/llama2/7B_qlora.yaml | 3 ++- recipes/configs/llama2/7B_qlora_single_device.yaml | 3 ++- recipes/configs/llama3/70B_full.yaml | 3 ++- recipes/configs/llama3/70B_lora.yaml | 3 ++- recipes/configs/llama3/8B_dora.yaml | 4 +++- recipes/configs/llama3/8B_dora_single_device.yaml | 3 ++- recipes/configs/llama3/8B_full.yaml | 5 +++-- recipes/configs/llama3/8B_full_single_device.yaml | 3 ++- recipes/configs/llama3/8B_lora.yaml | 4 +++- recipes/configs/llama3/8B_lora_single_device.yaml | 7 ++++--- recipes/configs/llama3/8B_qat_full.yaml | 4 +++- recipes/configs/llama3/8B_qdora_single_device.yaml | 3 ++- recipes/configs/llama3/8B_qlora_single_device.yaml | 3 ++- recipes/configs/llama3_1/405B_qlora.yaml | 1 + recipes/configs/llama3_1/70B_full.yaml | 3 ++- recipes/configs/llama3_1/70B_lora.yaml | 3 ++- recipes/configs/llama3_1/8B_full.yaml | 5 +++-- recipes/configs/llama3_1/8B_full_single_device.yaml | 3 ++- recipes/configs/llama3_1/8B_lora.yaml | 3 ++- recipes/configs/llama3_1/8B_lora_single_device.yaml | 3 ++- recipes/configs/llama3_1/8B_qlora_single_device.yaml | 3 ++- recipes/configs/llama3_2/1B_full.yaml | 3 ++- recipes/configs/llama3_2/1B_full_single_device.yaml | 3 ++- recipes/configs/llama3_2/1B_lora.yaml | 3 ++- recipes/configs/llama3_2/1B_lora_single_device.yaml | 3 ++- recipes/configs/llama3_2/1B_qlora_single_device.yaml | 3 ++- recipes/configs/llama3_2/3B_full.yaml | 3 ++- recipes/configs/llama3_2/3B_full_single_device.yaml | 3 ++- recipes/configs/llama3_2/3B_lora.yaml | 3 ++- recipes/configs/llama3_2/3B_lora_single_device.yaml | 3 ++- recipes/configs/llama3_2/3B_qlora_single_device.yaml | 3 ++- .../llama3_2/knowledge_distillation_single_device.yaml | 4 +++- recipes/configs/llama3_2_vision/11B_full.yaml | 3 ++- .../configs/llama3_2_vision/11B_full_single_device.yaml | 3 ++- recipes/configs/llama3_2_vision/11B_lora.yaml | 3 ++- .../configs/llama3_2_vision/11B_lora_single_device.yaml | 3 ++- recipes/configs/mistral/7B_full.yaml | 4 +++- recipes/configs/mistral/7B_full_low_memory.yaml | 3 ++- recipes/configs/mistral/7B_full_ppo_low_memory.yaml | 3 ++- recipes/configs/mistral/7B_lora.yaml | 4 +++- recipes/configs/mistral/7B_lora_single_device.yaml | 3 ++- recipes/configs/mistral/7B_qlora_single_device.yaml | 3 ++- recipes/configs/phi3/mini_full.yaml | 4 +++- recipes/configs/phi3/mini_full_low_memory.yaml | 3 ++- recipes/configs/phi3/mini_lora.yaml | 4 +++- recipes/configs/phi3/mini_lora_single_device.yaml | 3 ++- recipes/configs/phi3/mini_qlora_single_device.yaml | 3 ++- recipes/configs/qwen2/0.5B_full.yaml | 5 +++-- recipes/configs/qwen2/0.5B_full_single_device.yaml | 3 ++- recipes/configs/qwen2/0.5B_lora.yaml | 4 +++- recipes/configs/qwen2/0.5B_lora_single_device.yaml | 3 ++- recipes/configs/qwen2/1.5B_full.yaml | 5 +++-- recipes/configs/qwen2/1.5B_full_single_device.yaml | 3 ++- recipes/configs/qwen2/1.5B_lora.yaml | 4 +++- recipes/configs/qwen2/1.5B_lora_single_device.yaml | 3 ++- recipes/configs/qwen2/7B_full.yaml | 5 +++-- recipes/configs/qwen2/7B_full_single_device.yaml | 3 ++- recipes/configs/qwen2/7B_lora.yaml | 4 +++- recipes/configs/qwen2/7B_lora_single_device.yaml | 3 ++- .../qwen2/knowledge_distillation_single_device.yaml | 4 +++- recipes/full_finetune_distributed.py | 3 ++- recipes/full_finetune_single_device.py | 3 ++- recipes/knowledge_distillation_single_device.py | 3 ++- recipes/lora_dpo_distributed.py | 3 ++- recipes/lora_dpo_single_device.py | 3 ++- recipes/lora_finetune_distributed.py | 3 ++- recipes/lora_finetune_single_device.py | 3 ++- recipes/ppo_full_finetune_single_device.py | 3 ++- recipes/qat_distributed.py | 3 ++- 93 files changed, 218 insertions(+), 102 deletions(-) diff --git a/recipes/configs/code_llama2/7B_full_low_memory.yaml b/recipes/configs/code_llama2/7B_full_low_memory.yaml index 6bca6c378f..bae760c67e 100644 --- a/recipes/configs/code_llama2/7B_full_low_memory.yaml +++ b/recipes/configs/code_llama2/7B_full_low_memory.yaml @@ -45,7 +45,9 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset + seed: null shuffle: True @@ -75,4 +77,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml index 263e3c12e1..1ada63446b 100644 --- a/recipes/configs/code_llama2/7B_lora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml @@ -49,7 +49,9 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset + seed: null shuffle: True @@ -84,7 +86,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Showcase the usage of PyTorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml index 4f6fd9be61..e7910d73cc 100644 --- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/CodeLlama-7b-hf/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/dev/8B_full_experimental.yaml b/recipes/configs/dev/8B_full_experimental.yaml index 4ed8a80e09..ee1e0f650c 100644 --- a/recipes/configs/dev/8B_full_experimental.yaml +++ b/recipes/configs/dev/8B_full_experimental.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -78,3 +79,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama3-finetune log_every_n_steps: null +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml index e1bd3272d0..a3b8ed59f7 100644 --- a/recipes/configs/gemma/2B_full.yaml +++ b/recipes/configs/gemma/2B_full.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -54,6 +55,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -70,4 +72,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml index b82faa39e2..8ed92dd115 100644 --- a/recipes/configs/gemma/2B_lora.yaml +++ b/recipes/configs/gemma/2B_lora.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -66,6 +67,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -82,4 +84,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml index d6e1664b71..b661710caf 100644 --- a/recipes/configs/gemma/2B_lora_single_device.yaml +++ b/recipes/configs/gemma/2B_lora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -83,7 +84,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml index 9b24d6c0ee..2b5cbf96bb 100644 --- a/recipes/configs/gemma/2B_qlora_single_device.yaml +++ b/recipes/configs/gemma/2B_qlora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -83,7 +84,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/7B_full.yaml b/recipes/configs/gemma/7B_full.yaml index a8924836fe..eb6b8c9426 100644 --- a/recipes/configs/gemma/7B_full.yaml +++ b/recipes/configs/gemma/7B_full.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -56,6 +57,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -72,4 +74,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml index 6db9b0ab82..4d74f93671 100644 --- a/recipes/configs/gemma/7B_lora.yaml +++ b/recipes/configs/gemma/7B_lora.yaml @@ -23,6 +23,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -68,6 +69,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -84,4 +86,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml index c82f0b76ba..369ba715e5 100644 --- a/recipes/configs/gemma/7B_lora_single_device.yaml +++ b/recipes/configs/gemma/7B_lora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -85,7 +86,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml b/recipes/configs/gemma/7B_qlora_single_device.yaml index fcbccb786b..301a7b4a5d 100644 --- a/recipes/configs/gemma/7B_qlora_single_device.yaml +++ b/recipes/configs/gemma/7B_qlora_single_device.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -85,7 +86,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-gemma-lora log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml index f5ecffc2ab..be5a4e8b1d 100644 --- a/recipes/configs/llama2/13B_full.yaml +++ b/recipes/configs/llama2/13B_full.yaml @@ -43,6 +43,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -58,6 +59,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -74,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index d657754139..797abc2a63 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -52,6 +52,7 @@ tokenizer: # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,6 +75,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 +compile: False # Logging output_dir: /tmp/lora_finetune_output @@ -81,7 +83,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml index 56431fdff5..9e8faaa800 100644 --- a/recipes/configs/llama2/13B_qlora_single_device.yaml +++ b/recipes/configs/llama2/13B_qlora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml index b4d0d9c9a9..9502690be2 100644 --- a/recipes/configs/llama2/70B_lora.yaml +++ b/recipes/configs/llama2/70B_lora.yaml @@ -52,6 +52,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -81,7 +82,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml index c1de2c2358..c0e2e320f3 100644 --- a/recipes/configs/llama2/70B_qlora.yaml +++ b/recipes/configs/llama2/70B_qlora.yaml @@ -57,6 +57,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset train_on_input: True seed: null @@ -91,7 +92,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 2e80276c84..3a6e3c35f2 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -74,4 +75,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml index 06558009ed..b9b933c2df 100644 --- a/recipes/configs/llama2/7B_full_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -79,4 +80,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index 2c9a694d7b..82276fa317 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda @@ -92,14 +93,14 @@ profiler: enabled: False - #Output directory of trace artifacts + # Output directory of trace artifacts output_dir: ${output_dir}/profiling_outputs #`torch.profiler.ProfilerActivity` types to trace cpu: True cuda: True - #trace options passed to `torch.profiler.profile` + # trace options passed to `torch.profiler.profile` profile_memory: False with_stack: False record_shapes: True diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 26f824814f..1a870956ff 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.stack_exchange_paired_dataset seed: null shuffle: True @@ -70,6 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: 1000 gradient_accumulation_steps: 8 +compile: False # Logging output_dir: /tmp/lora_dpo_output/ @@ -77,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 2ad3988867..408e28a7be 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.stack_exchange_paired_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml index ebaee584c2..a1c001b868 100644 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml index 6fca6c4d4a..d1a408aca5 100644 --- a/recipes/configs/llama2/7B_qat_full.yaml +++ b/recipes/configs/llama2/7B_qat_full.yaml @@ -22,6 +22,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -53,6 +54,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # QAT arguments quantizer: @@ -75,4 +77,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 052cdb9296..26fc4faf11 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset train_on_input: True seed: null @@ -82,7 +83,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml index 0893f48579..611c5b155b 100644 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/llama2/7B_qlora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml index a8b7ba619c..64e678b910 100644 --- a/recipes/configs/llama3/70B_full.yaml +++ b/recipes/configs/llama3/70B_full.yaml @@ -30,6 +30,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -110,4 +111,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index f3a921f289..baac18bedd 100644 --- a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -67,6 +67,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -97,7 +98,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml index 1265c82c72..a9ea97986e 100644 --- a/recipes/configs/llama3/8B_dora.yaml +++ b/recipes/configs/llama3/8B_dora.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -64,6 +65,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Logging output_dir: /tmp/dora_finetune_output @@ -71,7 +73,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml index 0fc0a484dc..188b54f757 100644 --- a/recipes/configs/llama3/8B_dora_single_device.yaml +++ b/recipes/configs/llama3/8B_dora_single_device.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,7 +75,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml index 7f24376db7..baa4a79417 100644 --- a/recipes/configs/llama3/8B_full.yaml +++ b/recipes/configs/llama3/8B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -57,7 +58,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml index cd3e3586ce..6b8e1ad4b8 100644 --- a/recipes/configs/llama3/8B_full_single_device.yaml +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -78,4 +79,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml index d65138f348..69a2349035 100644 --- a/recipes/configs/llama3/8B_lora.yaml +++ b/recipes/configs/llama3/8B_lora.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -69,6 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 +compile: False # Logging output_dir: /tmp/lora_finetune_output @@ -76,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml index e49afacbb1..661bbe86db 100644 --- a/recipes/configs/llama3/8B_lora_single_device.yaml +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda @@ -91,14 +92,14 @@ profiler: _component_: torchtune.training.setup_torch_profiler enabled: False - #Output directory of trace artifacts + # Output directory of trace artifacts output_dir: ${output_dir}/profiling_outputs #`torch.profiler.ProfilerActivity` types to trace cpu: True cuda: True - #trace options passed to `torch.profiler.profile` + # trace options passed to `torch.profiler.profile` profile_memory: False with_stack: False record_shapes: True diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml index ff4d9c3195..07461e8243 100644 --- a/recipes/configs/llama3/8B_qat_full.yaml +++ b/recipes/configs/llama3/8B_qat_full.yaml @@ -21,6 +21,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -43,6 +44,7 @@ resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 epochs: 3 +compile: False # QAT arguments quantizer: @@ -74,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/alpaca-llama3-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml index 7180c5a72c..fafda9a123 100644 --- a/recipes/configs/llama3/8B_qdora_single_device.yaml +++ b/recipes/configs/llama3/8B_qdora_single_device.yaml @@ -45,6 +45,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml index 1eef476d17..83c0dcb9d1 100644 --- a/recipes/configs/llama3/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 6398a840ec..421f2bb4c2 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: True # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset train_on_input: True seed: null diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml index fcae062999..654b86c6e6 100644 --- a/recipes/configs/llama3_1/70B_full.yaml +++ b/recipes/configs/llama3_1/70B_full.yaml @@ -29,6 +29,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -112,4 +113,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3_1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index 861279127a..6e98357f13 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -66,6 +66,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -96,7 +97,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml index 4420b0cae5..1c71813e42 100644 --- a/recipes/configs/llama3_1/8B_full.yaml +++ b/recipes/configs/llama3_1/8B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -60,7 +61,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -79,4 +80,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml index 9f7d9472ce..3db2de566b 100644 --- a/recipes/configs/llama3_1/8B_full_single_device.yaml +++ b/recipes/configs/llama3_1/8B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.1-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index 5f101b170f..586ab6cd19 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ -50,6 +50,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -80,7 +81,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index 3991f728ce..4c24d330f9 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -79,7 +80,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index a9b0662105..5e66d817cc 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml index 23b699f754..74f2e15e43 100644 --- a/recipes/configs/llama3_2/1B_full.yaml +++ b/recipes/configs/llama3_2/1B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml index fc4b0a507c..e7b9ca8939 100644 --- a/recipes/configs/llama3_2/1B_full_single_device.yaml +++ b/recipes/configs/llama3_2/1B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index 228e4989d5..bb182d3192 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index c9ebed6dc7..eca60cd2ce 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index da552b2a0f..f896668a45 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -45,6 +45,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -75,7 +76,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml index 6d738331ae..dee24434ad 100644 --- a/recipes/configs/llama3_2/3B_full.yaml +++ b/recipes/configs/llama3_2/3B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -75,4 +76,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml index 9b21f4f865..5a61d297d7 100644 --- a/recipes/configs/llama3_2/3B_full_single_device.yaml +++ b/recipes/configs/llama3_2/3B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/full-llama3.2-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index d13a303814..9ea9745a77 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -48,6 +48,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 255c75e227..283f9eda40 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index 360443b9e1..f36c5ee126 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -46,6 +46,7 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml index 9cb029666f..ba39474639 100644 --- a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml @@ -7,6 +7,7 @@ # tune download meta-llama/Meta-Llama-3.1-8B-Instruct --output-dir /tmp/Meta-Llama-3.1-8B-Instruct --ignore-patterns "original/consolidated.00.pth" # # You get better results using KD if the teacher model has already been fine-tuned on the target dataset: + packed: False # Set to true for great speed ups # tune run lora_finetune_single_device --config llama3_1/8B_lora_single_device # # To launch on a single device, run the following command from root: @@ -62,6 +63,7 @@ teacher_checkpointer: # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -96,7 +98,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index ee9180dbcf..1a4c76d307 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -76,4 +77,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index 3372c1a540..3e02d5d103 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -77,7 +78,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (default is disabled) profiler: diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index 357af64496..f0cd05d012 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -48,6 +48,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -86,4 +87,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index f56828c301..83e2227ca5 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -46,6 +46,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.multimodal.the_cauldron_dataset subset: ocrvqa seed: null @@ -85,7 +86,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml index 602b3fe082..25cf783846 100644 --- a/recipes/configs/mistral/7B_full.yaml +++ b/recipes/configs/mistral/7B_full.yaml @@ -29,6 +29,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -60,6 +61,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -76,4 +78,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1/ log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_full_low_memory.yaml b/recipes/configs/mistral/7B_full_low_memory.yaml index 7e68ee8066..a6cf37fa8c 100644 --- a/recipes/configs/mistral/7B_full_low_memory.yaml +++ b/recipes/configs/mistral/7B_full_low_memory.yaml @@ -31,6 +31,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -81,4 +82,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1/ log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml index bf9aad71c3..8c583fac0b 100644 --- a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml +++ b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml @@ -32,6 +32,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.text_completion_dataset source: trl-internal-testing/sentiment-trl-style split: train @@ -135,7 +136,7 @@ optimizer: _component_: bitsandbytes.optim.PagedAdamW lr: 3e-6 optimizer_in_bwd: True -log_peak_memory_stats: False +log_peak_memory_stats: True enable_activation_checkpointing: True # Reduced precision diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index 08196660fc..a2dc801925 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -30,6 +30,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -74,6 +75,7 @@ batch_size: 4 epochs: 3 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Training env device: cuda @@ -90,4 +92,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml index 2ebc9f798e..21212f4983 100644 --- a/recipes/configs/mistral/7B_lora_single_device.yaml +++ b/recipes/configs/mistral/7B_lora_single_device.yaml @@ -27,6 +27,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -89,7 +90,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml index 3bbfebe3ba..e2f6884a9f 100644 --- a/recipes/configs/mistral/7B_qlora_single_device.yaml +++ b/recipes/configs/mistral/7B_qlora_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_dataset seed: null shuffle: True @@ -90,7 +91,7 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Mistral-7B-v0.1 log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Show case the usage of pytorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/phi3/mini_full.yaml b/recipes/configs/phi3/mini_full.yaml index 0ee746ddd4..0be89337a7 100644 --- a/recipes/configs/phi3/mini_full.yaml +++ b/recipes/configs/phi3/mini_full.yaml @@ -42,6 +42,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -57,6 +58,7 @@ optimizer: lr: 5e-6 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +compile: False # Training env device: cuda @@ -71,4 +73,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_full_low_memory.yaml b/recipes/configs/phi3/mini_full_low_memory.yaml index 182a4f6a98..470f4a1afe 100644 --- a/recipes/configs/phi3/mini_full_low_memory.yaml +++ b/recipes/configs/phi3/mini_full_low_memory.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,4 +75,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml index fff05885ef..1af4929985 100644 --- a/recipes/configs/phi3/mini_lora.yaml +++ b/recipes/configs/phi3/mini_lora.yaml @@ -49,6 +49,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -68,6 +69,7 @@ lr_scheduler: num_warmup_steps: 100 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +compile: False # Training env device: cuda @@ -82,4 +84,4 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml index b5c14b19ca..21a12a3cc1 100644 --- a/recipes/configs/phi3/mini_lora_single_device.yaml +++ b/recipes/configs/phi3/mini_lora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Showcase the usage of PyTorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml index 10114bc67a..21c9403bef 100644 --- a/recipes/configs/phi3/mini_qlora_single_device.yaml +++ b/recipes/configs/phi3/mini_qlora_single_device.yaml @@ -47,6 +47,7 @@ save_adapter_weights_only: False # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -84,7 +85,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: /tmp/Phi-3-mini-4k-instruct/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Showcase the usage of PyTorch profiler # Set enabled to False as it's only needed for debugging training diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml index 5bf14591f9..39748ee052 100644 --- a/recipes/configs/qwen2/0.5B_full.yaml +++ b/recipes/configs/qwen2/0.5B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -56,7 +57,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 16 - +compile: False # Training env device: cuda @@ -73,4 +74,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-0.5B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/0.5B_full_single_device.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml index 67091a4e8a..2d2afe883e 100644 --- a/recipes/configs/qwen2/0.5B_full_single_device.yaml +++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml @@ -24,6 +24,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,4 +75,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-0.5B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml index e0608eba5c..33b5e968d0 100644 --- a/recipes/configs/qwen2/0.5B_lora.yaml +++ b/recipes/configs/qwen2/0.5B_lora.yaml @@ -46,6 +46,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null @@ -70,6 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 +compile: False # Logging output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune @@ -78,7 +80,7 @@ metric_logger: log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml index 602c63853a..beeb21b072 100644 --- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml @@ -45,6 +45,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -76,7 +77,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml index cb7b5e2318..8e850bae50 100644 --- a/recipes/configs/qwen2/1.5B_full.yaml +++ b/recipes/configs/qwen2/1.5B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -56,7 +57,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 - +compile: False # Training env device: cuda @@ -73,4 +74,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-1.5B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/1.5B_full_single_device.yaml b/recipes/configs/qwen2/1.5B_full_single_device.yaml index 5da79ceb69..cc7fd5f566 100644 --- a/recipes/configs/qwen2/1.5B_full_single_device.yaml +++ b/recipes/configs/qwen2/1.5B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null @@ -79,4 +80,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-1.5B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml index a496dade08..845cb71184 100644 --- a/recipes/configs/qwen2/1.5B_lora.yaml +++ b/recipes/configs/qwen2/1.5B_lora.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -66,6 +67,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 8 +compile: False # Logging output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune @@ -73,7 +75,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml index b41269de1a..f2e8d2beb4 100644 --- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml +++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml @@ -44,6 +44,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -74,7 +75,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index 7ffc07e457..06083d908f 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -26,6 +26,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -59,7 +60,7 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 16 - +compile: False # Training env device: cuda @@ -76,4 +77,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-7B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/7B_full_single_device.yaml b/recipes/configs/qwen2/7B_full_single_device.yaml index 560dd5fc9f..13290d82a0 100644 --- a/recipes/configs/qwen2/7B_full_single_device.yaml +++ b/recipes/configs/qwen2/7B_full_single_device.yaml @@ -28,6 +28,7 @@ tokenizer: # Dataset dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,4 +79,4 @@ metric_logger: log_dir: ${output_dir} output_dir: /tmp/Qwen2-7B-Instruct-finetune log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index d3b63fd1df..6e778ecd7d 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -50,6 +50,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -72,6 +73,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 +compile: False # Logging output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune @@ -79,7 +81,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 6f9fb35b15..e0b19d03a3 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -48,6 +48,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -78,7 +79,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml index 9cc894a7e5..078a91c417 100644 --- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml @@ -7,6 +7,7 @@ # tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns None # # You get better results using KD if the teacher model has already been fine-tuned on the target dataset: + packed: False # Set to true for great speed ups # tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device # # To launch on a single device, run the following command from root: @@ -56,6 +57,7 @@ resume_from_checkpoint: False # Dataset and Sampler dataset: + packed: False # Set to true for great speed ups _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True @@ -89,7 +91,7 @@ metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Environment device: cuda diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 6e83e575f9..50f32878af 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -472,7 +472,8 @@ def _setup_optimizer( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 2addd92944..a35cd27079 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -478,7 +478,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index c2ee8c7cc4..63d680626c 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -494,7 +494,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index e903ab274a..830012cd02 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -446,7 +446,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index c158d17875..2b4a4bc00e 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -334,7 +334,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 1569dfee63..235c376751 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -586,7 +586,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 5d39b72086..3499c412b7 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -497,7 +497,8 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 7679af3fd3..7681319581 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -554,7 +554,8 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index eb2e44fae2..f660b6f9f8 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -494,7 +494,8 @@ def _setup_optimizer( def _setup_data( self, - cfg_dataset: DictConfig, + cfg_dataset: + packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: From 00d09d0742c8aa5b06f2e4933e4e236bb1a364e8 Mon Sep 17 00:00:00 2001 From: krammnic Date: Sun, 20 Oct 2024 14:31:12 -0400 Subject: [PATCH 02/10] fix incorrect replacement --- recipes/full_finetune_distributed.py | 3 +-- recipes/full_finetune_single_device.py | 3 +-- recipes/knowledge_distillation_single_device.py | 3 +-- recipes/lora_dpo_distributed.py | 3 +-- recipes/lora_dpo_single_device.py | 3 +-- recipes/lora_finetune_distributed.py | 3 +-- recipes/lora_finetune_single_device.py | 3 +-- recipes/ppo_full_finetune_single_device.py | 3 +-- recipes/qat_distributed.py | 3 +-- 9 files changed, 9 insertions(+), 18 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 50f32878af..6e83e575f9 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -472,8 +472,7 @@ def _setup_optimizer( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index a35cd27079..2addd92944 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -478,8 +478,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index 63d680626c..c2ee8c7cc4 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -494,8 +494,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index 830012cd02..e903ab274a 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -446,8 +446,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index 2b4a4bc00e..c158d17875 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -334,8 +334,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 235c376751..1569dfee63 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -586,8 +586,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 3499c412b7..5d39b72086 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -497,8 +497,7 @@ def _setup_lr_scheduler( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, collate_fn: str, diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 7681319581..7679af3fd3 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -554,8 +554,7 @@ def _setup_optimizer( return optimizer def _setup_data( - self, cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int + self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index f660b6f9f8..eb2e44fae2 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -494,8 +494,7 @@ def _setup_optimizer( def _setup_data( self, - cfg_dataset: - packed: False # Set to true for great speed ups DictConfig, + cfg_dataset: DictConfig, shuffle: bool, batch_size: int, ) -> Tuple[DistributedSampler, DataLoader]: From bb7648f0af863019413c0a14baaa0d5711bfa1eb Mon Sep 17 00:00:00 2001 From: krammnic Date: Mon, 21 Oct 2024 11:42:43 -0400 Subject: [PATCH 03/10] remove packed from dpo --- recipes/configs/llama2/7B_lora_dpo.yaml | 1 - recipes/configs/llama2/7B_lora_dpo_single_device.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 1a870956ff..1a0b4bc390 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -46,7 +46,6 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: - packed: False # Set to true for great speed ups _component_: torchtune.datasets.stack_exchange_paired_dataset seed: null shuffle: True diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 408e28a7be..bfe8185f06 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -45,7 +45,6 @@ save_adapter_weights_only: False # Dataset and Sampler dataset: - packed: False # Set to true for great speed ups _component_: torchtune.datasets.stack_exchange_paired_dataset seed: null shuffle: True From 22c84abf57f099dfe557ee9b4176a5301d45532d Mon Sep 17 00:00:00 2001 From: krammnic Date: Mon, 21 Oct 2024 11:43:21 -0400 Subject: [PATCH 04/10] remove packed from dpo --- recipes/configs/mistral/7B_full_ppo_low_memory.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml index 8c583fac0b..db3b3f5e86 100644 --- a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml +++ b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml @@ -32,7 +32,6 @@ tokenizer: # Dataset dataset: - packed: False # Set to true for great speed ups _component_: torchtune.datasets.text_completion_dataset source: trl-internal-testing/sentiment-trl-style split: train From 8acfa7558cf655f56b60081d3d5cabf6adb759c9 Mon Sep 17 00:00:00 2001 From: krammnic Date: Mon, 21 Oct 2024 11:59:58 -0400 Subject: [PATCH 05/10] add comment --- recipes/configs/llama3/70B_full.yaml | 2 +- recipes/configs/llama3/70B_lora.yaml | 2 +- recipes/configs/llama3_1/405B_qlora.yaml | 2 +- recipes/configs/llama3_1/70B_full.yaml | 2 +- recipes/configs/llama3_1/70B_lora.yaml | 2 +- recipes/configs/llama3_1/8B_full.yaml | 2 +- recipes/configs/llama3_1/8B_full_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_lora.yaml | 2 +- recipes/configs/llama3_1/8B_lora_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_full.yaml | 2 +- recipes/configs/llama3_2/1B_full_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_lora.yaml | 2 +- recipes/configs/llama3_2/1B_lora_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_full.yaml | 2 +- recipes/configs/llama3_2/3B_full_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_lora.yaml | 2 +- recipes/configs/llama3_2/3B_lora_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2_vision/11B_full.yaml | 2 +- recipes/configs/llama3_2_vision/11B_full_single_device.yaml | 2 +- recipes/configs/llama3_2_vision/11B_lora.yaml | 2 +- recipes/configs/llama3_2_vision/11B_lora_single_device.yaml | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml index 64e678b910..21f0870094 100644 --- a/recipes/configs/llama3/70B_full.yaml +++ b/recipes/configs/llama3/70B_full.yaml @@ -100,7 +100,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index baac18bedd..49b246a23c 100644 --- a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -90,7 +90,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 421f2bb4c2..8e4359b632 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -72,7 +72,7 @@ fsdp: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/qlora_finetune_output diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml index 654b86c6e6..b5619ac0b4 100644 --- a/recipes/configs/llama3_1/70B_full.yaml +++ b/recipes/configs/llama3_1/70B_full.yaml @@ -102,7 +102,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index 6e98357f13..030eb0a5fb 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -89,7 +89,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora-llama3_1-finetune-output diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml index 1c71813e42..43fb5a11b8 100644 --- a/recipes/configs/llama3_1/8B_full.yaml +++ b/recipes/configs/llama3_1/8B_full.yaml @@ -69,7 +69,7 @@ device: cuda # Memory management enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml index 3db2de566b..e99d77164f 100644 --- a/recipes/configs/llama3_1/8B_full_single_device.yaml +++ b/recipes/configs/llama3_1/8B_full_single_device.yaml @@ -62,7 +62,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training environment device: cuda diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index 586ab6cd19..6f37eeeaa8 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ -73,7 +73,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index 4c24d330f9..41caed2a38 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -72,7 +72,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 64 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index 5e66d817cc..f3476fcb16 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -71,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/qlora_finetune_output/ diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml index 74f2e15e43..97257c623a 100644 --- a/recipes/configs/llama3_2/1B_full.yaml +++ b/recipes/configs/llama3_2/1B_full.yaml @@ -65,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: False -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml index e7b9ca8939..c69b592cf8 100644 --- a/recipes/configs/llama3_2/1B_full_single_device.yaml +++ b/recipes/configs/llama3_2/1B_full_single_device.yaml @@ -59,7 +59,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training environment device: cuda diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index bb182d3192..b531461a4b 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -70,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index eca60cd2ce..e2b97a6d77 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -69,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index f896668a45..0372578e23 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -68,7 +68,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml index dee24434ad..027a1d9e27 100644 --- a/recipes/configs/llama3_2/3B_full.yaml +++ b/recipes/configs/llama3_2/3B_full.yaml @@ -65,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml index 5a61d297d7..29955d8d47 100644 --- a/recipes/configs/llama3_2/3B_full_single_device.yaml +++ b/recipes/configs/llama3_2/3B_full_single_device.yaml @@ -60,7 +60,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training environment device: cuda diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index 9ea9745a77..e30ad9d63b 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -71,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 283f9eda40..3e888090a8 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -70,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index f36c5ee126..a98e1ac9c0 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -69,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index 1a4c76d307..2ef3298f11 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -61,7 +61,7 @@ optimizer: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index 3e02d5d103..e72ccbfdbd 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -63,7 +63,7 @@ optimizer_in_bwd: False loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index f0cd05d012..9d94e5ce7f 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -71,7 +71,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index 83e2227ca5..fe1c04929c 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -70,7 +70,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile: False # set it to True for better memory and performance +compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance # Training env device: cuda From 9b475018178d40c6d5397048c61f2e2abb56c474 Mon Sep 17 00:00:00 2001 From: krammnic Date: Wed, 23 Oct 2024 09:19:12 -0400 Subject: [PATCH 06/10] add cuda check --- recipes/full_finetune_distributed.py | 6 ++++++ recipes/full_finetune_single_device.py | 6 ++++++ recipes/knowledge_distillation_single_device.py | 6 ++++++ recipes/lora_dpo_distributed.py | 7 +++++++ recipes/lora_dpo_single_device.py | 6 ++++++ recipes/lora_finetune_distributed.py | 7 +++++++ recipes/lora_finetune_single_device.py | 6 ++++++ recipes/ppo_full_finetune_single_device.py | 6 ++++++ recipes/qat_distributed.py | 6 ++++++ 9 files changed, 56 insertions(+) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 6e83e575f9..8221eadc04 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -121,6 +121,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # _is_rank_zero is used primarily for logging. In the future, the logger # should directly take care of this _, rank = training.get_world_size_and_rank() diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 2addd92944..ea7bd5a6e6 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -116,6 +116,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint self._gradient_accumulation_steps = cfg.gradient_accumulation_steps diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index c2ee8c7cc4..db357e031c 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -120,6 +120,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests self.seed = training.set_seed(seed=cfg.seed) diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index e903ab274a..acc48e6390 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -130,6 +130,13 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + + # training attributes self._enable_activation_checkpointing = cfg.enable_activation_checkpointing diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index c158d17875..a7a882afda 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -95,6 +95,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats + # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests self.seed = training.set_seed(seed=cfg.seed) diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 1569dfee63..556a11c16b 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -151,6 +151,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # training attributes self._enable_activation_checkpointing = cfg.enable_activation_checkpointing self._enable_activation_offloading = cfg.get( @@ -836,6 +842,7 @@ def train(self) -> None: log_dict.update( training.get_memory_stats(device=self._device) ) + if self._clip_grad_norm is not None: log_dict.update({"grad_norm": grad_norm}) self._metric_logger.log_dict( diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 5d39b72086..c580389cb3 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -141,6 +141,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests self.seed = training.set_seed(seed=cfg.seed) diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index 7679af3fd3..aa0895b7f8 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -119,6 +119,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests self.seed = training.set_seed(seed=cfg.seed) diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index eb2e44fae2..2d7169e9da 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -127,6 +127,12 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) + if self._log_peak_memory_stats and self._device.type == "cuda": + log.info( + "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + ) + self._log_peak_memory_stats = False + # _is_rank_zero is used primarily for logging. In the future, the logger # should directly take care of this _, rank = training.get_world_size_and_rank() From e60eebd8bec69d9fc65aad4b9710d06c699f5437 Mon Sep 17 00:00:00 2001 From: krammnic Date: Wed, 23 Oct 2024 09:20:37 -0400 Subject: [PATCH 07/10] fix lint --- recipes/lora_dpo_distributed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index acc48e6390..f008e71b91 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -136,7 +136,6 @@ def __init__(self, cfg: DictConfig) -> None: ) self._log_peak_memory_stats = False - # training attributes self._enable_activation_checkpointing = cfg.enable_activation_checkpointing From dacc3d2b3a045352ad0e2a2f5b29f19e03cbe360 Mon Sep 17 00:00:00 2001 From: krammnic Date: Thu, 24 Oct 2024 09:38:19 -0400 Subject: [PATCH 08/10] fixes --- recipes/configs/qwen2/knowledge_distillation_single_device.yaml | 1 - recipes/lora_dpo_single_device.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml index 078a91c417..f7d1b191cd 100644 --- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml +++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml @@ -7,7 +7,6 @@ # tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns None # # You get better results using KD if the teacher model has already been fine-tuned on the target dataset: - packed: False # Set to true for great speed ups # tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device # # To launch on a single device, run the following command from root: diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index a7a882afda..dca5553dac 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -99,7 +99,7 @@ def __init__(self, cfg: DictConfig) -> None: log.info( "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) - self._log_peak_memory_stats + self._log_peak_memory_stats = False # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests From a311232e845b5052131254029351c6467bf84c32 Mon Sep 17 00:00:00 2001 From: krammnic Date: Sat, 26 Oct 2024 07:36:42 -0400 Subject: [PATCH 09/10] fix compile typo --- recipes/configs/llama3/70B_full.yaml | 2 +- recipes/configs/llama3/70B_lora.yaml | 2 +- recipes/configs/llama3_1/405B_qlora.yaml | 2 +- recipes/configs/llama3_1/70B_full.yaml | 2 +- recipes/configs/llama3_1/70B_lora.yaml | 2 +- recipes/configs/llama3_1/8B_full.yaml | 2 +- recipes/configs/llama3_1/8B_full_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_lora.yaml | 2 +- recipes/configs/llama3_1/8B_lora_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_full.yaml | 2 +- recipes/configs/llama3_2/1B_full_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_lora.yaml | 2 +- recipes/configs/llama3_2/1B_lora_single_device.yaml | 2 +- recipes/configs/llama3_2/1B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_full.yaml | 2 +- recipes/configs/llama3_2/3B_full_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_lora.yaml | 2 +- recipes/configs/llama3_2/3B_lora_single_device.yaml | 2 +- recipes/configs/llama3_2/3B_qlora_single_device.yaml | 2 +- recipes/configs/llama3_2_vision/11B_full.yaml | 2 +- recipes/configs/llama3_2_vision/11B_full_single_device.yaml | 2 +- recipes/configs/llama3_2_vision/11B_lora.yaml | 2 +- recipes/configs/llama3_2_vision/11B_lora_single_device.yaml | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml index 21f0870094..91d95d2898 100644 --- a/recipes/configs/llama3/70B_full.yaml +++ b/recipes/configs/llama3/70B_full.yaml @@ -100,7 +100,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml index 49b246a23c..247daba5cc 100644 --- a/recipes/configs/llama3/70B_lora.yaml +++ b/recipes/configs/llama3/70B_lora.yaml @@ -90,7 +90,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml index 8e4359b632..e59b333fa1 100644 --- a/recipes/configs/llama3_1/405B_qlora.yaml +++ b/recipes/configs/llama3_1/405B_qlora.yaml @@ -72,7 +72,7 @@ fsdp: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/qlora_finetune_output diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml index b5619ac0b4..c016015056 100644 --- a/recipes/configs/llama3_1/70B_full.yaml +++ b/recipes/configs/llama3_1/70B_full.yaml @@ -102,7 +102,7 @@ device: cuda enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] fsdp_cpu_offload: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml index 030eb0a5fb..ad1bc64110 100644 --- a/recipes/configs/llama3_1/70B_lora.yaml +++ b/recipes/configs/llama3_1/70B_lora.yaml @@ -89,7 +89,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora-llama3_1-finetune-output diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml index 43fb5a11b8..da27c91852 100644 --- a/recipes/configs/llama3_1/8B_full.yaml +++ b/recipes/configs/llama3_1/8B_full.yaml @@ -69,7 +69,7 @@ device: cuda # Memory management enable_activation_checkpointing: True custom_sharded_layers: ['tok_embeddings', 'output'] -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml index e99d77164f..04ba339b23 100644 --- a/recipes/configs/llama3_1/8B_full_single_device.yaml +++ b/recipes/configs/llama3_1/8B_full_single_device.yaml @@ -62,7 +62,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml index 6f37eeeaa8..d0a5202847 100644 --- a/recipes/configs/llama3_1/8B_lora.yaml +++ b/recipes/configs/llama3_1/8B_lora.yaml @@ -73,7 +73,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 32 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml index 41caed2a38..bc9a3956f3 100644 --- a/recipes/configs/llama3_1/8B_lora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml @@ -72,7 +72,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 64 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml index f3476fcb16..b194acb181 100644 --- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml +++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml @@ -71,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 16 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/qlora_finetune_output/ diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml index 97257c623a..c90fea966f 100644 --- a/recipes/configs/llama3_2/1B_full.yaml +++ b/recipes/configs/llama3_2/1B_full.yaml @@ -65,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: False -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml index c69b592cf8..e4d1f87fac 100644 --- a/recipes/configs/llama3_2/1B_full_single_device.yaml +++ b/recipes/configs/llama3_2/1B_full_single_device.yaml @@ -59,7 +59,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml index b531461a4b..b5e53900ef 100644 --- a/recipes/configs/llama3_2/1B_lora.yaml +++ b/recipes/configs/llama3_2/1B_lora.yaml @@ -70,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml index e2b97a6d77..8c94bb0582 100644 --- a/recipes/configs/llama3_2/1B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml @@ -69,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml index 0372578e23..282d0d9e89 100644 --- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml @@ -68,7 +68,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml index 027a1d9e27..bfe9ef6420 100644 --- a/recipes/configs/llama3_2/3B_full.yaml +++ b/recipes/configs/llama3_2/3B_full.yaml @@ -65,7 +65,7 @@ device: cuda # Memory management enable_activation_checkpointing: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml index 29955d8d47..14a5369e71 100644 --- a/recipes/configs/llama3_2/3B_full_single_device.yaml +++ b/recipes/configs/llama3_2/3B_full_single_device.yaml @@ -60,7 +60,7 @@ loss: max_steps_per_epoch: null gradient_accumulation_steps: 1 optimizer_in_bwd: True -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training environment device: cuda diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml index e30ad9d63b..076f9d9171 100644 --- a/recipes/configs/llama3_2/3B_lora.yaml +++ b/recipes/configs/llama3_2/3B_lora.yaml @@ -71,7 +71,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml index 3e888090a8..b36d18f872 100644 --- a/recipes/configs/llama3_2/3B_lora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml @@ -70,7 +70,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml index a98e1ac9c0..3efbd6c43c 100644 --- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml +++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml @@ -69,7 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 4 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml index 2ef3298f11..3837e8c39c 100644 --- a/recipes/configs/llama3_2_vision/11B_full.yaml +++ b/recipes/configs/llama3_2_vision/11B_full.yaml @@ -61,7 +61,7 @@ optimizer: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml index e72ccbfdbd..93d8c2cd11 100644 --- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml @@ -63,7 +63,7 @@ optimizer_in_bwd: False loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml index 9d94e5ce7f..449d786153 100644 --- a/recipes/configs/llama3_2_vision/11B_lora.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora.yaml @@ -71,7 +71,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml index fe1c04929c..15b097e6ed 100644 --- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml +++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml @@ -70,7 +70,7 @@ lr_scheduler: loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss clip_grad_norm: 1.0 -compile=False # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance +compile: False # pytorch compile, set to true for perf/memory improvement # Training env device: cuda From b0b4b1410f99fb99718483b9a8ebd48418b6e8f5 Mon Sep 17 00:00:00 2001 From: krammnic Date: Sat, 26 Oct 2024 07:37:56 -0400 Subject: [PATCH 10/10] fix other typos --- recipes/full_finetune_distributed.py | 4 ++-- recipes/full_finetune_single_device.py | 4 ++-- recipes/knowledge_distillation_single_device.py | 4 ++-- recipes/lora_dpo_distributed.py | 4 ++-- recipes/lora_dpo_single_device.py | 4 ++-- recipes/lora_finetune_distributed.py | 4 ++-- recipes/lora_finetune_single_device.py | 4 ++-- recipes/ppo_full_finetune_single_device.py | 4 ++-- recipes/qat_distributed.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 8221eadc04..a6c0d81724 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -121,9 +121,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index ea7bd5a6e6..469732a217 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -116,9 +116,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py index db357e031c..8612fa8f8e 100644 --- a/recipes/knowledge_distillation_single_device.py +++ b/recipes/knowledge_distillation_single_device.py @@ -120,9 +120,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py index f008e71b91..dd98f72685 100644 --- a/recipes/lora_dpo_distributed.py +++ b/recipes/lora_dpo_distributed.py @@ -130,9 +130,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py index dca5553dac..f34694ccc8 100644 --- a/recipes/lora_dpo_single_device.py +++ b/recipes/lora_dpo_single_device.py @@ -95,9 +95,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py index 556a11c16b..98a09705f6 100644 --- a/recipes/lora_finetune_distributed.py +++ b/recipes/lora_finetune_distributed.py @@ -151,9 +151,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index c580389cb3..34b5406e77 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -141,9 +141,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py index aa0895b7f8..1030217d74 100644 --- a/recipes/ppo_full_finetune_single_device.py +++ b/recipes/ppo_full_finetune_single_device.py @@ -119,9 +119,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 2d7169e9da..b717c8233a 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -127,9 +127,9 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type == "cuda": + if self._log_peak_memory_stats and self._device.type != "cuda": log.info( - "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False." + "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) self._log_peak_memory_stats = False