From 278a9622f6bd8c768c7b1a91e0c126d03f88d553 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Tue, 10 Dec 2024 18:39:57 +0530
Subject: [PATCH 1/2] x

Signed-off-by: SumanthRH
---
 templates/e2e-llm-workflows/README.md | 4 +---
 .../configs/training/full_param/llama-3-70b.yaml | 5 +----
 .../configs/training/full_param/llama-3-8b.yaml | 4 +---
 .../configs/training/full_param/mistral-7b.yaml | 4 +---
 .../configs/training/full_param/mixtral-8x7b.yaml | 4 +---
 .../configs/training/lora/llama-3-70b.yaml | 6 +-----
 .../configs/training/lora/llama-3-8b.yaml | 4 +---
 .../configs/training/lora/mistral-7b.yaml | 4 +---
 .../configs/training/lora/mixtral-8x7b.yaml | 4 +---
 .../fine-tune-function-calling/README.ipynb | 5 +----
 .../fine-tune-function-calling/README.md | 5 +----
 .../meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml | 6 +-----
 .../meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml | 12 +++++-------
 .../custom/mistralai/mistral-7b/full/16xA10-512.yaml | 6 ------
 14 files changed, 17 insertions(+), 56 deletions(-)

diff --git a/templates/e2e-llm-workflows/README.md b/templates/e2e-llm-workflows/README.md
index 6e0ee9fcb..8be077ccf 100644
--- a/templates/e2e-llm-workflows/README.md
+++ b/templates/e2e-llm-workflows/README.md
@@ -334,10 +334,8 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
   deepspeed:
     config_path: configs/deepspeed/zero_3_offload_optim+param.json
   flash_attention_2: true
-  trainer_resources:
-    memory: 53687091200 # 50 GB memory
   worker_resources:
-    accelerator_type:A10G: 0.001
+    anyscale/accelerator_shape:4xA10G: 0.001
   lora_config:
     r: 8
     lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index 60e9be233..16d9d1d0e 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -14,8 +14,5 @@ deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
 # Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:8xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
index 4e415eb69..50df6b102 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
index acc5cce33..5fef646ad 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
index 792257d41..f7f7b3647 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
@@ -13,7 +13,5 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
index d59a7641e..f62b7998a 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
@@ -13,12 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
-trainer_resources:
-  memory: 161061273600 # 130 GB memory
 worker_resources:
-  memory: 53687091200 # 70 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
index 9e63a4864..7860bc111 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
index a66901530..bfc456d0f 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
 flash_attention_2: true
-trainer_resources:
-  memory: 53687091200 # 50 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
index bedd43eca..8aba9d2db 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
@@ -13,10 +13,8 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
 flash_attention_2: true
-trainer_resources:
-  memory: 107374182400 # 100 GB memory
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
 lora_config:
   r: 8
   lora_alpha: 16
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
index b07500553..f8350661e 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
@@ -386,11 +386,8 @@
     "    \"deepspeed\": {\n",
     "        \"config_path\": \"deepspeed_configs/zero_3_offload_optim+param.json\"\n",
     "    },\n",
-    "    \"trainer_resources\": {\n",
-    "        \"memory\": 50 << 30 # 50 GB memory\n",
-    "    },\n",
     "    \"worker_resources\": {\n",
-    "        \"accelerator_type:A10G\": 0.001\n",
+    "        \"anyscale/accelerator_shape:4xA10G\": 0.001\n",
     "    },\n",
     "    \"lora_config\": {\n",
     "        \"r\": 8,\n",
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
index 87a87d634..596c08cfa 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
@@ -247,11 +247,8 @@ config = {
     "deepspeed": {
         "config_path": "deepspeed_configs/zero_3_offload_optim+param.json"
     },
-    "trainer_resources": {
-        "memory": 50 << 30 # 50 GB memory
-    },
     "worker_resources": {
-        "accelerator_type:A10G": 0.001
+        "anyscale/accelerator_shape:4xA10G": 0.001
     },
     "lora_config": {
         "r": 8,
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
index 6e1476d01..a9f5c5a40 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
@@ -34,14 +34,10 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 (aka trainer) should have 140 GB memory
-# This memory is required for weight aggregation.
-trainer_resources:
-  memory: 150_323_855_360 # 140 GB memory

 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
 worker_resources:
   memory: 53_687_091_200 # 50 GB memory
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
index a9c56f752..065b159fe 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-8B/full/16xA10-512.yaml
@@ -34,14 +34,12 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 does fp32 aggregation, so we need 4x8B ~ 32GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full param, we need to ensure that there is enough memory for fp32 aggregation: here that is 4x8B ~ 32GB of memory.
+# A standard 4xA10G node in AWS or GCP will suffice. If needed, an additional "memory" entry can be added here with the memory per worker in bytes.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  # Optionally, enforce that 35 GB memory is available per rank.
+  # memory: 37_580_963_840
+  anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
index 8fd0d0062..a2bd0e487 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
@@ -34,12 +34,6 @@ num_checkpoints_to_keep: 1
 deepspeed:
   config_path: deepspeed_configs/zero_3_offload_optim+param.json

-# Rank-0 does fp32 aggregation, so we need 4x7B ~ 28GB of memory
-# Setting it to 35 GB to be safe
-# This should provision a large enough node to schedule rank-0
-trainer_resources:
-  memory: 37_580_963_840
-
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.

From 5fa48b2800a3a7548103acde7e8bd664b0d78768 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Tue, 10 Dec 2024 18:54:34 +0530
Subject: [PATCH 2/2] x

Signed-off-by: SumanthRH
---
 .../configs/training/full_param/llama-3-70b.yaml | 2 +-
 .../full/{32xA10-4k.yaml => 32xA10-4k-high-memory.yaml} | 4 +++-
 .../custom/mistralai/mistral-7b/full/16xA10-512.yaml | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)
 rename templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/{32xA10-4k.yaml => 32xA10-4k-high-memory.yaml} (76%)

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index 16d9d1d0e..190adab7e 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -13,6 +13,6 @@ output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
 flash_attention_2: true
-# Head node would have at least 200 GB memory
+# Head node should have at least 280 GB memory
 worker_resources:
   anyscale/accelerator_shape:8xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
similarity index 76%
rename from templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
rename to templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
index a9f5c5a40..7b345e9b1 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/meta-llama/Meta-Llama-3-70B/full/32xA10-4k-high-memory.yaml
@@ -38,6 +38,8 @@ deepspeed:
 # Accelerator type, the value of 0.001 is not important, as long as it is
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
+# For full param, we need to ensure that there is enough memory for fp32 aggregation: here that is 4x70B ~ 280GB of memory.
+# In AWS, for example, there are two types of 4xA10G nodes, one with 192GB and another with 384GB of RAM. We can explicitly request more "memory" per worker to ensure we get at least 280GB in a node.
 worker_resources:
-  memory: 53_687_091_200 # 50 GB memory
+  memory: 75_161_927_680 # 70 GB memory per rank. With 4 GPUs in a node, that is a request of at least 280GB per node.
   anyscale/accelerator_shape:4xA10G: 0.001
diff --git a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
index a2bd0e487..7b752195a 100644
--- a/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
+++ b/templates/fine-tune-llm_v2/training_configs/custom/mistralai/mistral-7b/full/16xA10-512.yaml
@@ -38,4 +38,4 @@ deepspeed:
 # between 0 and 1. This ensures that the given accelerator is available for each trainer
 # worker.
 worker_resources:
-  accelerator_type:A10G: 0.001
+  anyscale/accelerator_shape:4xA10G: 0.001
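
For reference, the resource request pattern these two patches converge on is a worker_resources map keyed by an Anyscale accelerator shape, with an optional per-rank memory entry. A minimal sketch assembled from the 32xA10-4k-high-memory.yaml hunk above (the values are the ones in that hunk; the memory line matters only for full-parameter runs that do fp32 weight aggregation):

    worker_resources:
      # Optional: per-rank memory request in bytes (70 GB). With 4 ranks per
      # 4xA10G node, this requires at least 280 GB of RAM on the node, enough
      # for fp32 aggregation of a 70B-parameter model (4 bytes x 70B ~ 280 GB).
      memory: 75_161_927_680
      # Any value between 0 and 1 works; it only ensures that a 4xA10G node
      # is available for each trainer worker.
      anyscale/accelerator_shape:4xA10G: 0.001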