From 04658ed4df10480ea92457c999bfd75f22727073 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 16:44:01 -0800
Subject: [PATCH 1/4] mistral full param

Signed-off-by: Amog Kamsetty
---
 .../deepspeed_configs/zero_3_mistral_7b.json  | 35 +++++++++++++++++++
 .../mistral-7b-4k-4xg5_12xlarge.yaml          | 18 ++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
 create mode 100644 templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml

diff --git a/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
new file mode 100644
index 000000000..9cdb2a021
--- /dev/null
+++ b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
@@ -0,0 +1,35 @@
+{
+    "fp16": {
+        "enabled": "auto"
+    },
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": 5e8,
+        "stage3_prefetch_bucket_size": 5e8,
+        "stage3_param_persistence_threshold": 1e6,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true,
+        "round_robin_gradients": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 10,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
\ No newline at end of file
diff --git a/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml
new file mode 100644
index 000000000..fb0dc913f
--- /dev/null
+++ b/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml
@@ -0,0 +1,18 @@
+model_id: mistralai/Mistral-7B-Instruct-v0.1 # <-- change this to the model you want to fine-tune
+train_path: s3://air-example-data/viggo/train.jsonl
+valid_path: s3://air-example-data/viggo/valid.jsonl
+context_length: 4096 # <-- change this to the context length you want to use
+num_devices: 16 # <-- change this to total number of GPUs that you want to use
+num_epochs: 1 # <-- change this to the number of epochs that you want to train for
+train_batch_size_per_device: 2
+eval_batch_size_per_device: 2
+learning_rate: 1e-4
+num_checkpoints_to_keep: 1
+no_gradient_checkpoint: False
+dataset_size_scaling_factor: 10000
+output_dir: /mnt/local_storage
+deepspeed:
+  config_path: deepspeed_configs/zero_3_mistral_7b.json
+flash_attention_2: True
+worker_resources:
+  g5.12xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file

From 6da1afce427d527d858ca66237c38c92e0b1692e Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 17:10:30 -0800
Subject: [PATCH 2/4] lint

Signed-off-by: Amog Kamsetty
---
 .../full_param/mistral-7b-4k-4xg5_12xlarge.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml b/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml
index fb0dc913f..52228990d 100644
--- a/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml
+++ b/templates/fine-tune-llm/training_configs/full_param/mistral-7b-4k-4xg5_12xlarge.yaml
@@ -15,4 +15,4 @@ deepspeed:
   config_path: deepspeed_configs/zero_3_mistral_7b.json
 flash_attention_2: True
 worker_resources:
-  g5.12xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up
\ No newline at end of file
+  g5.12xlarge: 1 # <-- this maps to job_compute_configs file's custom_resources so the appropriate nodes can scale up

From a577741850267e3492f55028b4e179f077730e85 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 17:17:10 -0800
Subject: [PATCH 3/4] lint

Signed-off-by: Amog Kamsetty
---
 .../fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
index 9cdb2a021..07877dd5d 100644
--- a/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
+++ b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
@@ -32,4 +32,4 @@
     "train_batch_size": "auto",
     "train_micro_batch_size_per_gpu": "auto",
     "wall_clock_breakdown": false
-}
\ No newline at end of file
+}

From e33f6db7a0e81e32a460e6db30f01324f0745c8 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 23 Feb 2024 17:21:11 -0800
Subject: [PATCH 4/4] lint

Signed-off-by: Amog Kamsetty
---
 .../fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
index 07877dd5d..2ebec1f89 100644
--- a/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
+++ b/templates/fine-tune-llm/deepspeed_configs/zero_3_mistral_7b.json
@@ -1,4 +1,4 @@
-{ 
+{
     "fp16": {
         "enabled": "auto"
    },
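
A note on the configuration introduced above. The "auto" values in
zero_3_mistral_7b.json (fp16.enabled, gradient_accumulation_steps,
gradient_clipping, train_batch_size, train_micro_batch_size_per_gpu) are
placeholders that the launching framework is expected to resolve at run
time; Hugging Face Transformers' DeepSpeed integration, for instance,
fills them in from TrainingArguments. In the training YAML, num_devices: 16
corresponds to four g5.12xlarge nodes with 4 NVIDIA A10G GPUs each, which is
what the "4xg5_12xlarge" in the file name refers to.

Below is a minimal sketch of how those "auto" fields are typically resolved,
assuming a Hugging Face Trainer-style entry point; this template's actual
trainer may work differently:

    # Hypothetical sketch, not this template's entry point. When
    # TrainingArguments.deepspeed points at the JSON above, the HF
    # integration substitutes the "auto" fields from these arguments;
    # train_batch_size is then derived as
    # micro batch size * gradient accumulation steps * world size.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="/mnt/local_storage",   # mirrors output_dir in the YAML
        per_device_train_batch_size=2,     # -> train_micro_batch_size_per_gpu
        gradient_accumulation_steps=1,     # -> gradient_accumulation_steps
        max_grad_norm=1.0,                 # -> gradient_clipping
        bf16=True,                         # consistent with bf16.enabled: true
        deepspeed="deepspeed_configs/zero_3_mistral_7b.json",
    )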