From e55beb987fcbb14be197ff4233256f7c88a9cf2a Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Tue, 6 Feb 2024 21:50:58 -0800
Subject: [PATCH 1/4] add

Signed-off-by: Amog Kamsetty
---
 .../job_compute_configs/aws.yaml              |  4 +--
 .../llama-2-70b-4k-1xp4de_24xlarge.yaml       | 16 +++++++++
 .../llama-2-70b-chat-4k-1xp4de_24xlarge.yaml  | 16 +++++++++
 ...ma-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml | 36 +++++++++++++++++++
 .../llama-2-70b-lora-4k-1xp4de_24xlarge.yaml  | 36 +++++++++++++++++++
 5 files changed, 106 insertions(+), 2 deletions(-)
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/job_compute_configs/aws.yaml b/templates/fine-tune-llm/job_compute_configs/aws.yaml
index 19140cf72..1382bb575 100644
--- a/templates/fine-tune-llm/job_compute_configs/aws.yaml
+++ b/templates/fine-tune-llm/job_compute_configs/aws.yaml
@@ -1,8 +1,8 @@
 compute_config:
   allowed_azs:
   - any
-  cloud: my-cloud # You may specify `cloud_id` instead
-  region: any
+  cloud_id: cld_ldm5ez4edlp7yh4yiakp2u294w # You may specify `cloud_id` instead
+  region: us-west-2
   head_node_type:
     instance_type: m5.4xlarge
     name: head_node
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..dbe877070
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,16 @@
+model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..de1cc2bd5
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,16 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..5a47f8240
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,36 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
+padding: "longest"
+lora_config:
+  r: 8
+  lora_alpha: 16
+  lora_dropout: 0.05
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - gate_proj
+    - up_proj
+    - down_proj
+    - embed_tokens
+    - lm_head
+  task_type: "CAUSAL_LM"
+  modules_to_save: []
+  bias: "none"
+  fan_in_fan_out: false
+  init_lora_weights: true
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..b0ae9302f
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,36 @@
+model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
+padding: "longest"
+lora_config:
+  r: 8
+  lora_alpha: 16
+  lora_dropout: 0.05
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - gate_proj
+    - up_proj
+    - down_proj
+    - embed_tokens
+    - lm_head
+  task_type: "CAUSAL_LM"
+  modules_to_save: []
+  bias: "none"
+  fan_in_fan_out: false
+  init_lora_weights: true
\ No newline at end of file

From 953e110c2d8e1d0587f16117c1dc1d650095ace7 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 14:59:00 -0800
Subject: [PATCH 2/4] updated configs

Signed-off-by: Amog Kamsetty
---
 .../full_param/llama-2-13b-4k-4xg5_12xlarge.yaml |  1 +
 .../llama-2-70b-4k-2xp4de_24xlarge.yaml}         |  8 ++++----
 .../full_param/llama-2-7b-512-16xg5_4xlarge.yaml |  1 +
 .../llama-2-70b-chat-4k-1xp4de_24xlarge.yaml     | 16 ----------------
 .../lora/llama-2-13b-4k-4xg5_12xlarge.yaml       |  1 +
 .../lora/llama-2-7b-512-16xg5_4xlarge.yaml       |  1 +
 ...aml => mixtral-8X7b-512-1xp4de_24xlarge.yaml} |  1 +
 7 files changed, 9 insertions(+), 20 deletions(-)
 rename templates/fine-tune-llm/training_configs/{llama-2-70b-4k-1xp4de_24xlarge.yaml => full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml} (56%)
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
 rename templates/fine-tune-llm/training_configs/lora/{mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml => mixtral-8X7b-512-1xp4de_24xlarge.yaml} (95%)

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
index b46992e11..f060236db 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
similarity index 56%
rename from templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
rename to templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
index dbe877070..6110c8644 100644
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
@@ -1,13 +1,13 @@
 model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+train_path: s3://anyscale-production-data-cld-ldm5ez4edlp7yh4yiakp2u294w/org_4snvy99zwbmh4gbtk64jfqggmj/cld_ldm5ez4edlp7yh4yiakp2u294w/artifact_storage/finetune-5k.json # <-- change this to the path to your training data
 context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+num_devices: 16 # <-- change this to total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
 train_batch_size_per_device: 1
 eval_batch_size_per_device: 1
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_70b.json
diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
index 6ad7b6f38..fc184b47a 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index de1cc2bd5..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
index df6ea4ff0..6ab5de06c 100644
--- a/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json
diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
index 12269f27b..2094ecca9 100644
--- a/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json
diff --git a/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml b/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
similarity index 95%
rename from templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml
rename to templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
index dd2a18bd9..7ffac4ad5 100644
--- a/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
@@ -7,6 +7,7 @@ train_batch_size_per_device: 4
 eval_batch_size_per_device: 4
 learning_rate: 1e-4
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 no_gradient_checkpoint: False
 output_dir: /mnt/local_storage
 deepspeed:

From 03a69915a37fdf58ca7216df1392c0383e49bd0f Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 15:07:06 -0800
Subject: [PATCH 3/4] update

Signed-off-by: Amog Kamsetty
---
 .../llama-2-70b-4k-2xp4de_24xlarge.yaml       |  3 +-
 ...ma-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml | 36 -------------------
 .../llama-2-70b-lora-4k-1xp4de_24xlarge.yaml  | 36 -------------------
 3 files changed, 2 insertions(+), 73 deletions(-)
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
index 6110c8644..68d6488df 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
@@ -1,5 +1,6 @@
 model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-production-data-cld-ldm5ez4edlp7yh4yiakp2u294w/org_4snvy99zwbmh4gbtk64jfqggmj/cld_ldm5ez4edlp7yh4yiakp2u294w/artifact_storage/finetune-5k.json # <-- change this to the path to your training data
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- change this to the path to your validation data. This is optional
 context_length: 4096 # <-- change this to the context length you want to use
 num_devices: 16 # <-- change this to total number of GPUs that you want to use
 num_epochs: 1 # <-- change this to the number of epochs that you want to train for
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index 5a47f8240..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
-padding: "longest"
-lora_config:
-  r: 8
-  lora_alpha: 16
-  lora_dropout: 0.05
-  target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-    - embed_tokens
-    - lm_head
-  task_type: "CAUSAL_LM"
-  modules_to_save: []
-  bias: "none"
-  fan_in_fan_out: false
-  init_lora_weights: true
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index b0ae9302f..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
-padding: "longest"
-lora_config:
-  r: 8
-  lora_alpha: 16
-  lora_dropout: 0.05
-  target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-    - embed_tokens
-    - lm_head
-  task_type: "CAUSAL_LM"
-  modules_to_save: []
-  bias: "none"
-  fan_in_fan_out: false
-  init_lora_weights: true
\ No newline at end of file

From 7bcd58842698c8038224f4265fca3b641d7b3750 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 15:10:49 -0800
Subject: [PATCH 4/4] add chat

Signed-off-by: Amog Kamsetty
---
 .../llama-2-70b-chat-4k-2xp4de_24xlarge.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml
new file mode 100644
index 000000000..e45a2d023
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml
@@ -0,0 +1,17 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 16 # <-- change this to total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file