From cc5be8a9ad4238813e4eb3744270e60a86e2f024 Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 03:46:27 -0800
Subject: [PATCH 1/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.md | 4 ----
 templates/e2e-llm-workflows/README.ipynb | 1 -
 templates/e2e-llm-workflows/README.md | 2 --
 .../configs/training/full_param/llama-3-70b.yaml | 2 --
 .../configs/training/full_param/llama-3-8b.yaml | 2 --
 .../configs/training/full_param/mistral-7b.yaml | 2 --
 .../configs/training/full_param/mixtral-8x7b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/mistral-7b.yaml | 2 --
 .../e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml | 2 --
 11 files changed, 23 deletions(-)

diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 50e7ac16d..2f4207ebb 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -378,8 +378,6 @@ sanity_check_program(llama_70b, vanilla_program, ft_trainset[0])
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
-
-
     Program output label: extra_charge_on_statement
 
 
@@ -794,8 +792,6 @@ except ValueError as e:
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
-
-
     Non fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
     Fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
diff --git a/templates/e2e-llm-workflows/README.ipynb b/templates/e2e-llm-workflows/README.ipynb
index fbf46f0a6..5e6f2a30a 100644
--- a/templates/e2e-llm-workflows/README.ipynb
+++ b/templates/e2e-llm-workflows/README.ipynb
@@ -1007,7 +1007,6 @@
 "output_type": "stream",
 "text": [
 "context_length: 512\n",
- "dataset_size_scaling_factor: 10000\n",
 "deepspeed:\n",
 " config_path: configs/deepspeed/zero_3_offload_optim+param.json\n",
 "eval_batch_size_per_device: 16\n",
diff --git a/templates/e2e-llm-workflows/README.md b/templates/e2e-llm-workflows/README.md
index bf7a7786c..6e0ee9fcb 100644
--- a/templates/e2e-llm-workflows/README.md
+++ b/templates/e2e-llm-workflows/README.md
@@ -330,11 +330,9 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
     learning_rate: 1e-4
     padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
     num_checkpoints_to_keep: 1
-    dataset_size_scaling_factor: 10000
     output_dir: /mnt/local_storage
     deepspeed:
       config_path: configs/deepspeed/zero_3_offload_optim+param.json
-    dataset_size_scaling_factor: 10000 # internal flag. No need to change
     flash_attention_2: true
     trainer_resources:
       memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
index ef51c9e88..60e9be233 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
index 0b7d18e7b..4e415eb69 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
index 2cecafc7b..acc5cce33 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
index 8182755e8..792257d41 100644
--- a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
index 3c34d3c7a..d59a7641e 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:
diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
index 46b72d305..9e63a4864 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
index 8737bf240..a66901530 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory
diff --git a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
index 38541b98c..bedd43eca 100644
--- a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
+++ b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory

From 7b662fd65871942bc3ad896fa13f9f698a9f1e6e Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 03:51:40 -0800
Subject: [PATCH 2/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.ipynb | 2 --
 templates/e2e-dspy-workflow/README.md | 1 -
 .../e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml | 2 --
 .../end-to-end-examples/fine-tune-function-calling/README.ipynb | 1 -
 .../end-to-end-examples/fine-tune-function-calling/README.md | 1 -
 5 files changed, 7 deletions(-)

diff --git a/templates/e2e-dspy-workflow/README.ipynb b/templates/e2e-dspy-workflow/README.ipynb
index 9ab562e0e..29d243a8d 100644
--- a/templates/e2e-dspy-workflow/README.ipynb
+++ b/templates/e2e-dspy-workflow/README.ipynb
@@ -910,7 +910,6 @@
 " 'learning_rate': 3e-05,\n",
 " 'padding': 'longest',\n",
 " 'num_checkpoints_to_keep': 1,\n",
- " 'dataset_size_scaling_factor': 10000,\n",
 " 'output_dir': '/mnt/local_storage',\n",
 " 'deepspeed': {'config_path': 'configs/deepspeed/zero_3.json'},\n",
 " 'flash_attention_2': True,\n",
@@ -960,7 +959,6 @@
 " \u001b[32m'learning_rate'\u001b[0m: \u001b[1;36m3e-05\u001b[0m,\n",
 " \u001b[32m'padding'\u001b[0m: \u001b[32m'longest'\u001b[0m,\n",
 " \u001b[32m'num_checkpoints_to_keep'\u001b[0m: \u001b[1;36m1\u001b[0m,\n",
- " \u001b[32m'dataset_size_scaling_factor'\u001b[0m: \u001b[1;36m10000\u001b[0m,\n",
 " \u001b[32m'output_dir'\u001b[0m: \u001b[32m'/mnt/local_storage'\u001b[0m,\n",
 " \u001b[32m'deepspeed'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'config_path'\u001b[0m: \u001b[32m'configs/deepspeed/zero_3.json'\u001b[0m\u001b[1m}\u001b[0m,\n",
 " \u001b[32m'flash_attention_2'\u001b[0m: \u001b[3;92mTrue\u001b[0m,\n",
diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 2f4207ebb..8e298d6c9 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -542,7 +542,6 @@ rich.print(yaml.safe_load(open(llmforge_config_path)))
     'learning_rate': 3e-05,
     'padding': 'longest',
     'num_checkpoints_to_keep': 1,
-    'dataset_size_scaling_factor': 10000,
     'output_dir': '/mnt/local_storage',
     'deepspeed': {'config_path': 'configs/deepspeed/zero_3.json'},
     'flash_attention_2': True,
diff --git a/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml b/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
index 3102b1928..df589ae35 100644
--- a/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
+++ b/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
@@ -10,11 +10,9 @@ eval_batch_size_per_device: 16
 learning_rate: 3.0e-5
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 worker_resources:
   accelerator_type:A100-80G: 0.001
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
index b6dbeb77e..b07500553 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
@@ -413,7 +413,6 @@
 " \"fan_in_fan_out\": False,\n",
 " \"init_lora_weights\": True\n",
 " },\n",
- " \"dataset_size_scaling_factor\": 1e10, # Very large number\n",
 "}"
 ]
 },
diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
index 0d0017ab8..87a87d634 100644
--- a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
+++ b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
@@ -274,7 +274,6 @@ config = {
         "fan_in_fan_out": False,
         "init_lora_weights": True
     },
-    "dataset_size_scaling_factor": 1e10, # Very large number
 }
 ```
 

From e047f629f6de769d469c9fba74f409b7f6a8dd4e Mon Sep 17 00:00:00 2001
From: SumanthRH
Date: Mon, 9 Dec 2024 17:25:47 +0530
Subject: [PATCH 3/3] x

Signed-off-by: SumanthRH
---
 templates/e2e-dspy-workflow/README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
index 8e298d6c9..7abd9b8bc 100644
--- a/templates/e2e-dspy-workflow/README.md
+++ b/templates/e2e-dspy-workflow/README.md
@@ -378,6 +378,8 @@ sanity_check_program(llama_70b, vanilla_program, ft_trainset[0])
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
+
+
     Program output label: extra_charge_on_statement
 
 
@@ -791,6 +793,8 @@ except ValueError as e:
 ```
 
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
+
+
     Non fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])
     Program input: Example({'text': 'I still have not received an answer as to why I was charged $1.00 in a transaction?'}) (input_keys={'text'})
     Fine-tuned model returned invalid output out and errored out with Expected dict_keys(['reasoning', 'label']) but got dict_keys([])