From 4270cecfc697f538b2ec719bea4203f8926269ef Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 27 Oct 2023 22:36:30 -0400 Subject: [PATCH] fix eval_steps to be a sane default (#797) * fix eval_steps to be a sane default * update docs for fractional eval_steps --- README.md | 4 ++-- examples/cerebras/qlora.yml | 2 +- examples/code-llama/13b/lora.yml | 4 ++-- examples/code-llama/13b/qlora.yml | 4 ++-- examples/code-llama/34b/lora.yml | 4 ++-- examples/code-llama/34b/qlora.yml | 4 ++-- examples/code-llama/7b/lora.yml | 4 ++-- examples/code-llama/7b/qlora.yml | 4 ++-- examples/falcon/config-7b-qlora.yml | 2 +- examples/gptj/qlora.yml | 2 +- examples/jeopardy-bot/config.yml | 2 +- examples/llama-2/gptq-lora.yml | 2 +- examples/llama-2/lora.yml | 4 ++-- examples/llama-2/qlora.yml | 4 ++-- examples/llama-2/relora.yml | 4 ++-- examples/llama-2/tiny-llama.yml | 4 ++-- examples/mistral/config.yml | 4 ++-- examples/mistral/qlora.yml | 2 +- examples/mpt-7b/config.yml | 2 +- examples/pythia/lora.yml | 4 ++-- examples/redpajama/config-3b.yml | 2 +- examples/replit-3b/config-lora.yml | 2 +- examples/xgen-7b/xgen-7b-8k-qlora.yml | 2 +- 23 files changed, 36 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index add56caf5..c9575d659 100644 --- a/README.md +++ b/README.md @@ -618,14 +618,14 @@ gradient_accumulation_steps: 1 # The number of samples to include in each batch. This is the number of samples sent to each GPU. micro_batch_size: 2 eval_batch_size: -num_epochs: 3 +num_epochs: 4 warmup_steps: 100 learning_rate: 0.00003 lr_quadratic_warmup: logging_steps: save_strategy: # Set to `no` to skip checkpoint saves save_steps: # Leave empty to save at each epoch -eval_steps: # Leave empty to eval at each epoch +eval_steps: # Leave empty to eval at each epoch; use an integer to eval every N steps, or a decimal for a fraction of total steps save_total_limit: # Checkpoints saved at a time # Maximum number of iterations to train for. 
It precedes num_epochs which means that # if both are set, num_epochs will not be guaranteed. diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml index 352952a54..2dc4160da 100644 --- a/examples/cerebras/qlora.yml +++ b/examples/cerebras/qlora.yml @@ -49,7 +49,7 @@ flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml index 2909e2477..26c5cae95 100644 --- a/examples/code-llama/13b/lora.yml +++ b/examples/code-llama/13b/lora.yml @@ -34,7 +34,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -54,7 +54,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml index dff95a5aa..f6c1be56d 100644 --- a/examples/code-llama/13b/qlora.yml +++ b/examples/code-llama/13b/qlora.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 @@ -56,7 +56,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml index 5601b2e0b..1996e6c04 100644 --- a/examples/code-llama/34b/lora.yml +++ b/examples/code-llama/34b/lora.yml @@ -34,7 +34,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -54,7 +54,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git 
a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml index 71a39e534..c854ceab7 100644 --- a/examples/code-llama/34b/qlora.yml +++ b/examples/code-llama/34b/qlora.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 @@ -56,7 +56,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml index 345745681..a53123219 100644 --- a/examples/code-llama/7b/lora.yml +++ b/examples/code-llama/7b/lora.yml @@ -34,7 +34,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -54,7 +54,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml index 25357ad28..5ad53063c 100644 --- a/examples/code-llama/7b/qlora.yml +++ b/examples/code-llama/7b/qlora.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 @@ -56,7 +56,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/falcon/config-7b-qlora.yml b/examples/falcon/config-7b-qlora.yml index 3c201ff5f..78ffb43f6 100644 --- a/examples/falcon/config-7b-qlora.yml +++ b/examples/falcon/config-7b-qlora.yml @@ -53,7 +53,7 @@ output_dir: ./qlora-out # decrease if OOM, increase for max VRAM utilization micro_batch_size: 1 gradient_accumulation_steps: 2 -num_epochs: 3 +num_epochs: 4 # Optimizer for QLoRA optimizer: paged_adamw_32bit 
torchdistx_path: diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml index b9455624a..e887e15d5 100644 --- a/examples/gptj/qlora.yml +++ b/examples/gptj/qlora.yml @@ -46,7 +46,7 @@ flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: debug: deepspeed: diff --git a/examples/jeopardy-bot/config.yml b/examples/jeopardy-bot/config.yml index 710a74fdf..9dbdf6e6e 100644 --- a/examples/jeopardy-bot/config.yml +++ b/examples/jeopardy-bot/config.yml @@ -24,7 +24,7 @@ wandb_log_model: output_dir: ./jeopardy-bot-7b gradient_accumulation_steps: 1 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml index 759b304d8..2bce70f35 100644 --- a/examples/llama-2/gptq-lora.yml +++ b/examples/llama-2/gptq-lora.yml @@ -37,7 +37,7 @@ wandb_log_model: output_dir: ./model-out gradient_accumulation_steps: 1 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_torch adam_beta2: 0.95 adam_eps: 0.00001 diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml index 5afe3d7d1..8e1047c0b 100644 --- a/examples/llama-2/lora.yml +++ b/examples/llama-2/lora.yml @@ -34,7 +34,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -54,7 +54,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 save_steps: diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml index 447761f7e..afeb981ac 100644 --- a/examples/llama-2/qlora.yml +++ b/examples/llama-2/qlora.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 @@ -56,7 +56,7 
@@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 eval_table_size: save_steps: debug: diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml index 2e6923811..ffccb7c0f 100644 --- a/examples/llama-2/relora.yml +++ b/examples/llama-2/relora.yml @@ -40,7 +40,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -60,7 +60,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 save_steps: 50 debug: deepspeed: diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml index af05830aa..b249e9434 100644 --- a/examples/llama-2/tiny-llama.yml +++ b/examples/llama-2/tiny-llama.yml @@ -34,7 +34,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 @@ -54,7 +54,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 eval_table_size: save_steps: debug: diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml index 8edaa06b0..40a7e2f4e 100644 --- a/examples/mistral/config.yml +++ b/examples/mistral/config.yml @@ -26,7 +26,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.000005 @@ -46,7 +46,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 eval_table_size: eval_table_max_new_tokens: 128 save_steps: diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml index ac4bcbd16..ff929dcb6 100644 --- a/examples/mistral/qlora.yml +++ b/examples/mistral/qlora.yml @@ -63,7 +63,7 @@ xformers_attention: flash_attention: true warmup_steps: 10 -eval_steps: 20 +eval_steps: 0.05 eval_table_size: 
eval_table_max_new_tokens: 128 save_steps: diff --git a/examples/mpt-7b/config.yml b/examples/mpt-7b/config.yml index 7d124e2c0..c9401890c 100644 --- a/examples/mpt-7b/config.yml +++ b/examples/mpt-7b/config.yml @@ -26,7 +26,7 @@ wandb_log_model: output_dir: ./mpt-alpaca-7b gradient_accumulation_steps: 1 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine diff --git a/examples/pythia/lora.yml b/examples/pythia/lora.yml index c256429d8..b41e8197c 100644 --- a/examples/pythia/lora.yml +++ b/examples/pythia/lora.yml @@ -23,7 +23,7 @@ wandb_log_model: output_dir: ./lora-alpaca-pythia gradient_accumulation_steps: 1 micro_batch_size: 4 -num_epochs: 3 +num_epochs: 4 learning_rate: 0.00001 train_on_inputs: false group_by_length: false @@ -33,5 +33,5 @@ early_stopping_patience: resume_from_checkpoint: local_rank: weight_decay: 0.1 -eval_steps: 20 +eval_steps: 0.05 logging_steps: 1 diff --git a/examples/redpajama/config-3b.yml b/examples/redpajama/config-3b.yml index 30c198193..edabd0e31 100644 --- a/examples/redpajama/config-3b.yml +++ b/examples/redpajama/config-3b.yml @@ -27,7 +27,7 @@ wandb_log_model: output_dir: ./redpajama-alpaca-3b batch_size: 4 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine diff --git a/examples/replit-3b/config-lora.yml b/examples/replit-3b/config-lora.yml index cc882c212..c3f448fab 100644 --- a/examples/replit-3b/config-lora.yml +++ b/examples/replit-3b/config-lora.yml @@ -26,7 +26,7 @@ wandb_log_model: output_dir: ./lora-replit batch_size: 8 micro_batch_size: 1 -num_epochs: 3 +num_epochs: 4 optimizer: torchdistx_path: lr_scheduler: diff --git a/examples/xgen-7b/xgen-7b-8k-qlora.yml b/examples/xgen-7b/xgen-7b-8k-qlora.yml index f6fced944..40a62e6c9 100644 --- a/examples/xgen-7b/xgen-7b-8k-qlora.yml +++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml @@ -51,7 +51,7 @@ output_dir: ./qlora-out # decrease if OOM, increase for max 
VRAM utilization micro_batch_size: 1 gradient_accumulation_steps: 1 -num_epochs: 3 +num_epochs: 4 # Optimizer for QLoRA optimizer: paged_adamw_32bit torchdistx_path: