From cc5be8a9ad4238813e4eb3744270e60a86e2f024 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 03:46:27 -0800
Subject: [PATCH 1/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.md | 4 ----
 templates/e2e-llm-workflows/README.ipynb | 1 -
 templates/e2e-llm-workflows/README.md | 2 --
 .../configs/training/full_param/llama-3-70b.yaml | 2 --
 .../configs/training/full_param/llama-3-8b.yaml | 2 --
 .../configs/training/full_param/mistral-7b.yaml | 2 --
 .../configs/training/full_param/mixtral-8x7b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/mistral-7b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml | 2 --
 11 files changed, 23 deletions(-)

diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 50e7ac16d..2f4207ebb 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -378,8 +378,6 @@ sanity_check_program(llama_70b, vanilla_program, ft_trainset[0])
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
-
-
     Program output label: extra_charge_on_statement
 
 
@@ -794,8 +792,6 @@ except ValueError as e:
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
-
-
     Non fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
     Fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
diff --git a/templates/e2e-llm-workflows/README.ipynb b/templates/e2e-llm-workflows/README.ipynb
index fbf46f0a6..5e6f2a30a 100644
--- a/templates/e2e-llm-workflows/README.ipynb
+++ b/templates/e2e-llm-workflows/README.ipynb
@@ -1007,7 +1007,6 @@
 "output_type": "stream",
 "text": [
 "context_length: 512\n",
- "dataset_size_scaling_factor: 10000\n",
 "deepspeed:\n",
 " config_path: configs/deepspeed/zero_3_offload_optim+param.json\n",
 "eval_batch_size_per_device: 16\n",
diff --git a/templates/e2e-llm-workflows/README.md b/templates/e2e-llm-workflows/README.md
index bf7a7786c..6e0ee9fcb 100644
--- a/templates/e2e-llm-workflows/README.md
+++ b/templates/e2e-llm-workflows/README.md
@@ -330,11 +330,9 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
     learning_rate: 1e-4
     padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
     num_checkpoints_to_keep: 1
-    dataset_size_scaling_factor: 10000
     output_dir: /mnt/local_storage
     deepspeed:
       config_path: configs/deepspeed/zero_3_offload_optim+param.json
-    dataset_size_scaling_factor: 10000 # internal flag. No need to change
     flash_attention_2: true
     trainer_resources:
       memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index ef51c9e88..60e9be233 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
index 0b7d18e7b..4e415eb69 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
index 2cecafc7b..acc5cce33 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
index 8182755e8..792257d41 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
index 3c34d3c7a..d59a7641e 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
index 46b72d305..9e63a4864 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
index 8737bf240..a66901530 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
index 38541b98c..bedd43eca 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory

From 7b662fd65871942bc3ad896fa13f9f698a9f1e6e Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 03:51:40 -0800
Subject: [PATCH 2/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.ipynb | 2 --
 templates/e2e-dspy-workflow/README.md | 1 -
 .../e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml | 2 --
 .../end-to-end-examples/fine-tune-function-calling/README.ipynb | 1 -
 .../end-to-end-examples/fine-tune-function-calling/README.md | 1 -
 5 files changed, 7 deletions(-)

diff --git a/templates/e2e-dspy-workflow/README.ipynb b/templates/e2e-dspy-workflow/README.ipynb
index 9ab562e0e..29d243a8d 100644
--- a/templates/e2e-dspy-workflow/README.ipynb
+++ b/templates/e2e-dspy-workflow/README.ipynb
@@ -910,7 +910,6 @@
 " 'learning_rate': 3e-05,\n",
 " 'padding': 'longest',\n",
 " 'num_checkpoints_to_keep': 1,\n",
- " 'dataset_size_scaling_factor': 10000,\n",
 " 'output_dir': '/mnt/local_storage',\n",
 " 'deepspeed': {'config_path': 'configs/deepspeed/zero_3.json'},\n",
 " 'flash_attention_2': True,\n",
@@ -960,7 +959,6 @@
 " \u001b[32m'learning_rate'\u001b[0m: \u001b[1;36m3e-05\u001b[0m,\n",
 " \u001b[32m'padding'\u001b[0m: \u001b[32m'longest'\u001b[0m,\n",
 " \u001b[32m'num_checkpoints_to_keep'\u001b[0m: \u001b[1;36m1\u001b[0m,\n",
- " \u001b[32m'dataset_size_scaling_factor'\u001b[0m: \u001b[1;36m10000\u001b[0m,\n",
 " \u001b[32m'output_dir'\u001b[0m: \u001b[32m'/mnt/local_storage'\u001b[0m,\n",
 " \u001b[32m'deepspeed'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'config_path'\u001b[0m: \u001b[32m'configs/deepspeed/zero_3.json'\u001b[0m\u001b[1m}\u001b[0m,\n",
 " \u001b[32m'flash_attention_2'\u001b[0m: \u001b[3;92mTrue\u001b[0m,\n",
diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 2f4207ebb..8e298d6c9 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -542,7 +542,6 @@ rich.print(yaml.safe_load(open(llmforge_config_path)))
     'learning_rate': 3e-05,
     'padding': 'longest',
     'num_checkpoints_to_keep': 1,
-    'dataset_size_scaling_factor': 10000,
     'output_dir': '/mnt/local_storage',
     'deepspeed': {'config_path': 'configs/deepspeed/zero_3.json'},
     'flash_attention_2': True,
diff --git a/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml b/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
index 3102b1928..df589ae35 100644
--- a/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
@@ -10,11 +10,9 @@ eval_batch_size_per_device: 16
 learning_rate: 3.0e-5
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 worker_resources:
   accelerator_type:A100-80G: 0.001
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
index b6dbeb77e..b07500553 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
@@ -413,7 +413,6 @@
 " \"fan_in_fan_out\": False,\n",
 " \"init_lora_weights\": True\n",
 " },\n",
- " \"dataset_size_scaling_factor\": 1e10, # Very large number\n",
 "}"
 ]
 },
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
index 0d0017ab8..87a87d634 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
@@ -274,7 +274,6 @@ config = {
         "fan_in_fan_out": False,
         "init_lora_weights": True
     },
-    "dataset_size_scaling_factor": 1e10, # Very large number
 }
 ```
 

From e047f629f6de769d469c9fba74f409b7f6a8dd4e Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 17:25:47 +0530
Subject: [PATCH 3/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 8e298d6c9..7abd9b8bc 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -378,6 +378,8 @@ sanity_check_program(llama_70b, vanilla_program, ft_trainset[0])
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
+
+
     Program output label: extra_charge_on_statement
 
 
@@ -791,6 +793,8 @@ except ValueError as e:
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
+
+
     Non fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
     Fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])