From 278a9622f6bd8c768c7b1a91e0c126d03f88d553 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Tue, 10 Dec 2024 18:39:57 +0530
Subject: [PATCH 1/2] x

Signed-off-by: SumanthRH
---
 templates/e2e-llm-workflows/README.md | 4 +---
 .../configs/training/full_param/llama-3-70b.yaml | 5 +----
 .../configs/training/full_param/llama-3-8b.yaml | 4 +---
 .../configs/training/full_param/mistral-7b.yaml | 4 +---
 .../configs/training/full_param/mixtral-8x7b.yaml | 4 +---
 .../configs/training/lora/llama-3-70b.yaml | 6 +-----
 .../configs/training/lora/llama-3-8b.yaml | 4 +---
 .../configs/training/lora/mistral-7b.yaml | 4 +---
 .../configs/training/lora/mixtral-8x7b.yaml | 4 +---
 .../fine-tune-function-calling/README.ipynb | 5 +----
 .../fine-tune-function-calling/README.md | 5 +----
 .../meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml | 6 +-----
 .../meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml | 12 +++++-------
 .../custom/mistralai/mistral-7b/full/16xA10-512.yaml | 6 ------
 14 files changed, 17 insertions(+), 56 deletions(-)

diff --git a/templates/e2e-llm-workflows/README.md b/templates/e2e-llm-workflows/README.md
index 6e0ee9fcb..8be077ccf 100644
--- a/templates/e2e-llm-workflows/README.md
+++ b/templates/e2e-llm-workflows/README.md
@@ -334,10 +334,8 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
   deepspeed:
     config_path: configs/deepspeed/zero_3_offload_optim+param.json
   flash_attention_2: true
-  trainer_resources:
-    memory: 53687091200 # 50 GB memory
   worker_resources:
-    accelerator_type:A10G: 0.001
+    anyscale/accelerator_shape:4xA10G: 0.001
   lora_config:
     r: 8
     lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index 60e9be233..16d9d1d0e 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -14,8 +14,5 @@ deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
 # Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:8xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
index 4e415eb69..50df6b102 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
index acc5cce33..5fef646ad 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
index 792257d41..f7f7b3647 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
index d59a7641e..f62b7998a 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
@@ -13,12 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
index 9e63a4864..7860bc111 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
index a66901530..bfc456d0f 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
index bedd43eca..8aba9d2db 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
index b07500553..f8350661e 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
@@ -386,11 +386,8 @@
     "    \"deepspeed\": {\n",
     "        \"config_path\": \"deepspeed_configs/zero_3_offload_optim+param.json\"\n",
     "    },\n",
-    "    \"trainer_resources\": {\n",
-    "        \"memory\": 50 << 30 # 50 GB memory\n",
-    "    },\n",
     "    \"worker_resources\": {\n",
-    "        \"accelerator_type:A10G\": 0.001\n",
+    "        \"anyscale/accelerator_shape:4xA10G\": 0.001\n",
     "    },\n",
     "    \"lora_config\": {\n",
     "        \"r\": 8,\n",
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
index 87a87d634..596c08cfa 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
@@ -247,11 +247,8 @@ config = {
     "deepspeed": {
         "config_path": "deepspeed_configs/zero_3_offload_optim+param.json"
     },
-    "trainer_resources": {
-        "memory": 50 << 30 # 50 GB memory
-    },
     "worker_resources": {
-        "accelerator_type:A10G": 0.001
+        "anyscale/accelerator_shape:4xA10G": 0.001
     },
     "lora_config": {
         "r": 8,
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
index 6e1476d01..a9f5c5a40 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
@@ -34,14 +34,10 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 (aka trainer) should have 140 GB memory
-# This memory is required for weight aggregation.
-trainer_resources:
-  memory: 150_323_855_360 # 140 GB memory

 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
 worker_resources:
   memory: 53_687_091_200 # 50 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
index a9c56f752..065b159fe 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
@@ -34,14 +34,12 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 does fp32 aggregation, so we need 4x8B ~ 32GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full param, we need to ensure that there is enough memory for fp32 aggregation: here that is 4x8B ~ 32GB of memory.
+# A standard 4xA10G node in AWS or GCP will suffice. If needed, an additional "memory" entry can be added here with the memory per worker in bytes.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  # Optionally, enforce that 35 GB memory is available per rank.
+  # memory: 37_580_963_840
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
index 8fd0d0062..a2bd0e487 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
@@ -34,12 +34,6 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 does fp32 aggregation, so we need 4x7B ~ 28GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.

From 5fa48b2800a3a7548103acde7e8bd664b0d78768 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Tue, 10 Dec 2024 18:54:34 +0530
Subject: [PATCH 2/2] x

Signed-off-by: SumanthRH
---
 .../configs/training/full_param/llama-3-70b.yaml | 2 +-
 .../full/{32xA10-4k.yaml => 32xA10-4k-high-memory.yaml} | 4 +++-
 .../custom/mistralai/mistral-7b/full/16xA10-512.yaml | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)
 rename templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/{32xA10-4k.yaml => 32xA10-4k-high-memory.yaml} (76%)

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index 16d9d1d0e..190adab7e 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -13,6 +13,6 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
+# Head node should have at least 280 GB memory
 worker_resources:
   anyscale/accelerator_shape:8xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
similarity index 76%
rename from templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
rename to templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
index a9f5c5a40..7b345e9b1 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
@@ -38,6 +38,8 @@ deepspeed:
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full param, we need to ensure that there is enough memory for fp32 aggregation: here that is 4x70B ~ 280GB of memory.
+# In AWS, for example, there are two types of 4xA10G nodes, one with 192GB and another with 384GB of RAM. We can explicitly request more "memory" per worker to ensure we get at least 280GB in a node.
 worker_resources:
-  memory: 53_687_091_200 # 50 GB memory
+  memory: 75_161_927_680 # 70 GB memory per rank. With 4 GPUs in a node, that is a request of at least 280GB per node.
   anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
index a2bd0e487..7b752195a 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
@@ -38,4 +38,4 @@ deepspeed:
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
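
For reference, the resource request pattern these two patches converge on is a worker_resources map keyed by an Anyscale accelerator shape, with an optional per-rank memory entry. A minimal sketch assembled from the 32xA10-4k-high-memory.yaml hunk above (the values are the ones in that hunk; the memory line matters only for full-parameter runs that do fp32 weight aggregation):

    worker_resources:
      # Optional: per-rank memory request in bytes (70 GB). With 4 ranks per
      # 4xA10G node, this requires at least 280 GB of RAM on the node, enough
      # for fp32 aggregation of a 70B-parameter model (4 bytes x 70B ~ 280 GB).
      memory: 75_161_927_680
      # Any value between 0 and 1 works; it only ensures that a 4xA10G node
      # is available for each trainer worker.
      anyscale/accelerator_shape:4xA10G: 0.001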