anyscale · amogkam · Feb 23, 2024 · Feb 7, 2024 · Feb 16, 2024 · Feb 23, 2024
diff --git a/templates/fine-tune-llm/README.md b/templates/fine-tune-llm/README.md
@@ -7,6 +7,7 @@ This guide provides starter configurations if you would like to further customiz
 ### Supported base models
 
 - mistralai/Mistral-7B-Instruct-v0.1
+- mistralai/Mixtral-8x7b
 - meta-llama/Llama-2-7b-hf
 - meta-llama/Llama-2-7b-chat-hf
 - meta-llama/Llama-2-13b-hf

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
@@ -0,0 +1,17 @@
+model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- (optional) change this to the path to your validation data
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 16 # <-- change this to the total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to custom_resources in the job_compute_configs file so that the appropriate nodes can scale up
diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml
@@ -0,0 +1,17 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- (optional) change this to the path to your validation data
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 16 # <-- change this to the total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to custom_resources in the job_compute_configs file so that the appropriate nodes can scale up
diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json

diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json

diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json

diff --git a/...xtral-8X7b-512-1xp4de_24xlarge-viggo.yaml → ...ora/mixtral-8X7b-512-1xp4de_24xlarge.yaml b/...xtral-8X7b-512-1xp4de_24xlarge-viggo.yaml → ...ora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
@@ -7,6 +7,7 @@ train_batch_size_per_device: 4
 eval_batch_size_per_device: 4
 learning_rate: 1e-4
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 no_gradient_checkpoint: False
 output_dir: /mnt/local_storage
 deepspeed: