From e55beb987fcbb14be197ff4233256f7c88a9cf2a Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Tue, 6 Feb 2024 21:50:58 -0800
Subject: [PATCH 1/4] add

Signed-off-by: Amog Kamsetty
---
 .../job_compute_configs/aws.yaml              |  4 +--
 .../llama-2-70b-4k-1xp4de_24xlarge.yaml       | 16 +++++++++
 .../llama-2-70b-chat-4k-1xp4de_24xlarge.yaml  | 16 +++++++++
 ...ma-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml | 36 +++++++++++++++++++
 .../llama-2-70b-lora-4k-1xp4de_24xlarge.yaml  | 36 +++++++++++++++++++
 5 files changed, 106 insertions(+), 2 deletions(-)
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
 create mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/job_compute_configs/aws.yaml b/templates/fine-tune-llm/job_compute_configs/aws.yaml
index 19140cf72..1382bb575 100644
--- a/templates/fine-tune-llm/job_compute_configs/aws.yaml
+++ b/templates/fine-tune-llm/job_compute_configs/aws.yaml
@@ -1,8 +1,8 @@
 compute_config:
   allowed_azs:
   - any
-  cloud: my-cloud # You may specify `cloud_id` instead
-  region: any
+  cloud_id: cld_ldm5ez4edlp7yh4yiakp2u294w # You may specify `cloud_id` instead
+  region: us-west-2
   head_node_type:
     instance_type: m5.4xlarge
     name: head_node
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..dbe877070
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,16 @@
+model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..de1cc2bd5
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,16 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..5a47f8240
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,36 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
+padding: "longest"
+lora_config:
+  r: 8
+  lora_alpha: 16
+  lora_dropout: 0.05
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - gate_proj
+    - up_proj
+    - down_proj
+    - embed_tokens
+    - lm_head
+  task_type: "CAUSAL_LM"
+  modules_to_save: []
+  bias: "none"
+  fan_in_fan_out: false
+  init_lora_weights: true
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
new file mode 100644
index 000000000..b0ae9302f
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
@@ -0,0 +1,36 @@
+model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
+train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
+valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 8 # <-- change this to total number of GPUs that you want to use
+num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
+padding: "longest"
+lora_config:
+  r: 8
+  lora_alpha: 16
+  lora_dropout: 0.05
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - gate_proj
+    - up_proj
+    - down_proj
+    - embed_tokens
+    - lm_head
+  task_type: "CAUSAL_LM"
+  modules_to_save: []
+  bias: "none"
+  fan_in_fan_out: false
+  init_lora_weights: true
\ No newline at end of file

From 953e110c2d8e1d0587f16117c1dc1d650095ace7 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 14:59:00 -0800
Subject: [PATCH 2/4] updated configs

Signed-off-by: Amog Kamsetty
---
 .../full_param/llama-2-13b-4k-4xg5_12xlarge.yaml |  1 +
 .../llama-2-70b-4k-2xp4de_24xlarge.yaml}         |  8 ++++----
 .../full_param/llama-2-7b-512-16xg5_4xlarge.yaml |  1 +
 .../llama-2-70b-chat-4k-1xp4de_24xlarge.yaml     | 16 ----------------
 .../lora/llama-2-13b-4k-4xg5_12xlarge.yaml       |  1 +
 .../lora/llama-2-7b-512-16xg5_4xlarge.yaml       |  1 +
 ...aml => mixtral-8X7b-512-1xp4de_24xlarge.yaml} |  1 +
 7 files changed, 9 insertions(+), 20 deletions(-)
 rename templates/fine-tune-llm/training_configs/{llama-2-70b-4k-1xp4de_24xlarge.yaml => full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml} (56%)
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
 rename templates/fine-tune-llm/training_configs/lora/{mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml => mixtral-8X7b-512-1xp4de_24xlarge.yaml} (95%)

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
index b46992e11..f060236db 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
similarity index 56%
rename from templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
rename to templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
index dbe877070..6110c8644 100644
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-4k-1xp4de_24xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
@@ -1,13 +1,13 @@
 model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
+train_path: s3://anyscale-production-data-cld-ldm5ez4edlp7yh4yiakp2u294w/org_4snvy99zwbmh4gbtk64jfqggmj/cld_ldm5ez4edlp7yh4yiakp2u294w/artifact_storage/finetune-5k.json # <-- change this to the path to your training data
 context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
+num_devices: 16 # <-- change this to total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
 train_batch_size_per_device: 1
 eval_batch_size_per_device: 1
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_70b.json
diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
index 6ad7b6f38..fc184b47a 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index de1cc2bd5..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
index df6ea4ff0..6ab5de06c 100644
--- a/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/llama-2-13b-4k-4xg5_12xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 8
 eval_batch_size_per_device: 8
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_13b.json
diff --git a/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml b/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
index 12269f27b..2094ecca9 100644
--- a/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/llama-2-7b-512-16xg5_4xlarge.yaml
@@ -8,6 +8,7 @@ train_batch_size_per_device: 16
 eval_batch_size_per_device: 16
 learning_rate: 5e-6
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: deepspeed_configs/zero_3_llama_2_7b.json
diff --git a/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml b/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
similarity index 95%
rename from templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml
rename to templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
index dd2a18bd9..7ffac4ad5 100644
--- a/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge-viggo.yaml
+++ b/templates/fine-tune-llm/training_configs/lora/mixtral-8X7b-512-1xp4de_24xlarge.yaml
@@ -7,6 +7,7 @@ train_batch_size_per_device: 4
 eval_batch_size_per_device: 4
 learning_rate: 1e-4
 num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
 no_gradient_checkpoint: False
 output_dir: /mnt/local_storage
 deepspeed:

From 03a69915a37fdf58ca7216df1392c0383e49bd0f Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 15:07:06 -0800
Subject: [PATCH 3/4] update

Signed-off-by: Amog Kamsetty
---
 .../llama-2-70b-4k-2xp4de_24xlarge.yaml       |  3 +-
 ...ma-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml | 36 -------------------
 .../llama-2-70b-lora-4k-1xp4de_24xlarge.yaml  | 36 -------------------
 3 files changed, 2 insertions(+), 73 deletions(-)
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
 delete mode 100644 templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
index 6110c8644..68d6488df 100644
--- a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-4k-2xp4de_24xlarge.yaml
@@ -1,5 +1,6 @@
 model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-production-data-cld-ldm5ez4edlp7yh4yiakp2u294w/org_4snvy99zwbmh4gbtk64jfqggmj/cld_ldm5ez4edlp7yh4yiakp2u294w/artifact_storage/finetune-5k.json # <-- change this to the path to your training data
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- change this to the path to your validation data. This is optional
 context_length: 4096 # <-- change this to the context length you want to use
 num_devices: 16 # <-- change this to total number of GPUs that you want to use
 num_epochs: 1 # <-- change this to the number of epochs that you want to train for
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index 5a47f8240..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-chat-lora-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
-padding: "longest"
-lora_config:
-  r: 8
-  lora_alpha: 16
-  lora_dropout: 0.05
-  target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-    - embed_tokens
-    - lm_head
-  task_type: "CAUSAL_LM"
-  modules_to_save: []
-  bias: "none"
-  fan_in_fan_out: false
-  init_lora_weights: true
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
deleted file mode 100644
index b0ae9302f..000000000
--- a/templates/fine-tune-llm/training_configs/llama-2-70b-lora-4k-1xp4de_24xlarge.yaml
+++ /dev/null
@@ -1,36 +0,0 @@
-model_id: meta-llama/Llama-2-70b-hf # <-- change this to the model you want to fine-tune
-train_path: s3://anyscale-wapo-collab/finetune_full_train.json # <-- change this to the path to your training data
-valid_path: s3://anyscale-wapo-collab/finetune_full_val.json # <-- change this to the path to your validation data. This is optional
-context_length: 4096 # <-- change this to the context length you want to use
-num_devices: 8 # <-- change this to total number of GPUs that you want to use
-num_epochs: 10 # <-- change this to the number of epochs that you want to train for
-train_batch_size_per_device: 1
-eval_batch_size_per_device: 1
-learning_rate: 5e-6
-num_checkpoints_to_keep: 1
-output_dir: /mnt/local_storage
-deepspeed:
-  config_path: deepspeed_configs/zero_3_llama_2_70b.json
-flash_attention_2: True
-worker_resources:
-  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
-padding: "longest"
-lora_config:
-  r: 8
-  lora_alpha: 16
-  lora_dropout: 0.05
-  target_modules:
-    - q_proj
-    - v_proj
-    - k_proj
-    - o_proj
-    - gate_proj
-    - up_proj
-    - down_proj
-    - embed_tokens
-    - lm_head
-  task_type: "CAUSAL_LM"
-  modules_to_save: []
-  bias: "none"
-  fan_in_fan_out: false
-  init_lora_weights: true
\ No newline at end of file

From 7bcd58842698c8038224f4265fca3b641d7b3750 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 15:10:49 -0800
Subject: [PATCH 4/4] add chat

Signed-off-by: Amog Kamsetty
---
 .../llama-2-70b-chat-4k-2xp4de_24xlarge.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml

diff --git a/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml
new file mode 100644
index 000000000..e45a2d023
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/full_param/llama-2-70b-chat-4k-2xp4de_24xlarge.yaml
@@ -0,0 +1,17 @@
+model_id: meta-llama/Llama-2-70b-chat-hf # <-- change this to the model you want to fine-tune
+train_path: s3://air-example-data/gsm8k/train.jsonl # <-- change this to the path to your training data
+valid_path: s3://air-example-data/gsm8k/test.jsonl # <-- change this to the path to your validation data. This is optional
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 16 # <-- change this to total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 1
+eval_batch_size_per_device: 1
+learning_rate: 5e-6
+num_checkpoints_to_keep: 1
+dataset_size_scaling_factor: 10000
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_llama_2_70b.json
+flash_attention_2: True
+worker_resources:
+  p4de.24xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file