Remove trainer_resources from llmforge configs and use explicit instance types in all configs #394

Draft · wants to merge 2 commits into base: main
templates/e2e-llm-workflows/README.md · 4 changes: 1 addition & 3 deletions
@@ -334,10 +334,8 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
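The raw memory values in these configs are plain byte counts. A minimal sanity-check sketch in Python (not part of the templates; names are illustrative) for the numbers that appear throughout this diff:

GIB = 1024 ** 3  # bytes per GiB

# the "50 GB memory" value from the removed trainer_resources block above
assert 50 * GIB == 53687091200
# the notebook/Python variants below write the same value as a bit shift
assert 50 << 30 == 50 * GIB

# The accelerator entries (accelerator_type:A10G, anyscale/accelerator_shape:4xA10G) use a
# small fractional value such as 0.001; per the config comments, any value between 0 and 1
# works, since it only ensures workers are scheduled on nodes that advertise that resource.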
@@ -13,9 +13,6 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
+# Head node should have at least 280 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:8xA10G: 0.001
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
@@ -13,12 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
@@ -386,11 +386,8 @@
 "    \"deepspeed\": {\n",
 "        \"config_path\": \"deepspeed_configs/zero_3_offload_optim+param.json\"\n",
 "    },\n",
-"    \"trainer_resources\": {\n",
-"        \"memory\": 50 << 30  # 50 GB memory\n",
-"    },\n",
 "    \"worker_resources\": {\n",
-"        \"accelerator_type:A10G\": 0.001\n",
+"        \"anyscale/accelerator_shape:4xA10G\": 0.001\n",
 "    },\n",
 "    \"lora_config\": {\n",
 "        \"r\": 8,\n",
@@ -247,11 +247,8 @@ config = {
     "deepspeed": {
         "config_path": "deepspeed_configs/zero_3_offload_optim+param.json"
     },
-    "trainer_resources": {
-        "memory": 50 << 30  # 50 GB memory
-    },
     "worker_resources": {
-        "accelerator_type:A10G": 0.001
+        "anyscale/accelerator_shape:4xA10G": 0.001
     },
     "lora_config": {
         "r": 8,
@@ -34,14 +34,12 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json
 
-# Rank-0 (aka trainer) should have 140 GB memory
-# This memory is required for weight aggregation.
-trainer_resources:
-  memory: 150_323_855_360 # 140 GB memory
 
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full-param training, we need to ensure that there is enough memory for fp32 aggregation; here that is 4 x 70B ~ 280 GB of memory.
+# In AWS, for example, there are two types of 4xA10G nodes, one with 192 GB and another with 384 GB of RAM. We can explicitly request more "memory" per worker to ensure we get at least 280 GB in a node.
 worker_resources:
-  memory: 53_687_091_200 # 50 GB memory
-  accelerator_type:A10G: 0.001
+  memory: 75_161_927_680 # 70 GB memory per rank. With 4 GPUs in a node, that is a request of at least 280 GB per node.
+  anyscale/accelerator_shape:4xA10G: 0.001
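The arithmetic in the new comments can be checked directly; a minimal Python sketch (the 70B parameter count and the 4xA10G node shape are taken from the comments above):

# fp32 weight aggregation for full-parameter fine-tuning needs 4 bytes per parameter.
params = 70e9                      # 70B model
print(4 * params / 1e9)            # ~280 GB needed in the node for aggregation

# The config requests 70 GiB per rank; with 4 GPUs (ranks) per 4xA10G node this adds up
# to ~280 GiB per node, which can only be satisfied by the 384 GB node variant.
per_rank = 70 * 1024 ** 3
assert per_rank == 75_161_927_680  # the value under worker_resources.memory
print(4 * per_rank / 1024 ** 3)    # 280.0 GiB requested per node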
@@ -34,14 +34,12 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json
 
-# Rank-0 does fp32 aggregation, so we need 4x8B ~ 32GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full-param training, we need to ensure that there is enough memory for fp32 aggregation; here that is 4 x 8B ~ 32 GB of memory.
+# A standard 4xA10G node in AWS or GCP will suffice. If needed, an additional "memory" entry can be added here with the memory per worker in bytes.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  # Optionally, enforce that 35 GB memory is available per rank.
+  # memory: 37_580_963_840
+  anyscale/accelerator_shape:4xA10G: 0.001
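The same check for this config, a short sketch using the 8B parameter count from the comment above:

params = 8e9
print(4 * params / 1e9)                  # ~32 GB for fp32 aggregation
assert 35 * 1024 ** 3 == 37_580_963_840  # the optional 35 GiB per-rank memory value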
@@ -34,14 +34,8 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json
 
-# Rank-0 does fp32 aggregation, so we need 4x7B ~ 28GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
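And for this last config, a sketch using the 7B parameter count from the removed comment:

params = 7e9
print(4 * params / 1e9)  # ~28 GB for fp32 aggregation, which the removed config padded to 35 GiB (37_580_963_840 bytes)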