anyscale · SumanthRH · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/templates/e2e-dspy-workflow/README.ipynb b/templates/e2e-dspy-workflow/README.ipynb
@@ -910,7 +910,6 @@
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'learning_rate'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">3e-05</span>,\n",
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'padding'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'longest'</span>,\n",
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'num_checkpoints_to_keep'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">1</span>,\n",
-       "    <span style=\"color: #008000; text-decoration-color: #008000\">'dataset_size_scaling_factor'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">10000</span>,\n",
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'output_dir'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'/mnt/local_storage'</span>,\n",
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'deepspeed'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'config_path'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'configs/deepspeed/zero_3.json'</span><span style=\"font-weight: bold\">}</span>,\n",
        "    <span style=\"color: #008000; text-decoration-color: #008000\">'flash_attention_2'</span>: <span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>,\n",
@@ -960,7 +959,6 @@
        "    \u001b[32m'learning_rate'\u001b[0m: \u001b[1;36m3e-05\u001b[0m,\n",
        "    \u001b[32m'padding'\u001b[0m: \u001b[32m'longest'\u001b[0m,\n",
        "    \u001b[32m'num_checkpoints_to_keep'\u001b[0m: \u001b[1;36m1\u001b[0m,\n",
-       "    \u001b[32m'dataset_size_scaling_factor'\u001b[0m: \u001b[1;36m10000\u001b[0m,\n",
        "    \u001b[32m'output_dir'\u001b[0m: \u001b[32m'/mnt/local_storage'\u001b[0m,\n",
        "    \u001b[32m'deepspeed'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'config_path'\u001b[0m: \u001b[32m'configs/deepspeed/zero_3.json'\u001b[0m\u001b[1m}\u001b[0m,\n",
        "    \u001b[32m'flash_attention_2'\u001b[0m: \u001b[3;92mTrue\u001b[0m,\n",

diff --git a/templates/e2e-dspy-workflow/README.md b/templates/e2e-dspy-workflow/README.md
@@ -544,7 +544,6 @@ rich.print(yaml.safe_load(open(llmforge_config_path)))
     <span style="color: #008000; text-decoration-color: #008000">'learning_rate'</span>: <span style="color: #008080; text-decoration-color: #008080; font-weight: bold">3e-05</span>,
     <span style="color: #008000; text-decoration-color: #008000">'padding'</span>: <span style="color: #008000; text-decoration-color: #008000">'longest'</span>,
     <span style="color: #008000; text-decoration-color: #008000">'num_checkpoints_to_keep'</span>: <span style="color: #008080; text-decoration-color: #008080; font-weight: bold">1</span>,
-    <span style="color: #008000; text-decoration-color: #008000">'dataset_size_scaling_factor'</span>: <span style="color: #008080; text-decoration-color: #008080; font-weight: bold">10000</span>,
     <span style="color: #008000; text-decoration-color: #008000">'output_dir'</span>: <span style="color: #008000; text-decoration-color: #008000">'/mnt/local_storage'</span>,
     <span style="color: #008000; text-decoration-color: #008000">'deepspeed'</span>: <span style="font-weight: bold">{</span><span style="color: #008000; text-decoration-color: #008000">'config_path'</span>: <span style="color: #008000; text-decoration-color: #008000">'configs/deepspeed/zero_3.json'</span><span style="font-weight: bold">}</span>,
     <span style="color: #008000; text-decoration-color: #008000">'flash_attention_2'</span>: <span style="color: #00ff00; text-decoration-color: #00ff00; font-style: italic">True</span>,

diff --git a/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml b/templates/e2e-dspy-workflow/configs/training/lora/llama-3-8b.yaml
@@ -10,11 +10,9 @@ eval_batch_size_per_device: 16
 learning_rate: 3.0e-5
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 worker_resources:
   accelerator_type:A100-80G: 0.001

diff --git a/templates/e2e-llm-workflows/README.ipynb b/templates/e2e-llm-workflows/README.ipynb
@@ -1007,7 +1007,6 @@
      "output_type": "stream",
      "text": [
       "context_length: 512\n",
-      "dataset_size_scaling_factor: 10000\n",
       "deepspeed:\n",
       "  config_path: configs/deepspeed/zero_3_offload_optim+param.json\n",
       "eval_batch_size_per_device: 16\n",

diff --git a/templates/e2e-llm-workflows/README.md b/templates/e2e-llm-workflows/README.md
@@ -330,11 +330,9 @@ We also have recipes for [LoRA](https://arxiv.org/abs/2106.09685) (where we trai
     learning_rate: 1e-4
     padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
     num_checkpoints_to_keep: 1
-    dataset_size_scaling_factor: 10000
     output_dir: /mnt/local_storage
     deepspeed:
       config_path: configs/deepspeed/zero_3_offload_optim+param.json
-    dataset_size_scaling_factor: 10000 # internal flag. No need to change
     flash_attention_2: true
     trainer_resources:
       memory: 53687091200 # 50 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/full_param/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 5e-6
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-70b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 # Head node would have at least 200 GB memory
 trainer_resources:

diff --git a/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml b/templates/e2e-llm-workflows/configs/training/lora/llama-3-8b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mistral-7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 16
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3_offload_optim+param.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 53687091200 # 50 GB memory

diff --git a/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml b/templates/e2e-llm-workflows/configs/training/lora/mixtral-8x7b.yaml
@@ -9,11 +9,9 @@ eval_batch_size_per_device: 8
 learning_rate: 1e-4
 padding: "longest" # This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
 num_checkpoints_to_keep: 1
-dataset_size_scaling_factor: 10000
 output_dir: /mnt/local_storage
 deepspeed:
   config_path: configs/deepspeed/zero_3.json
-dataset_size_scaling_factor: 10000 # internal flag. No need to change
 flash_attention_2: true
 trainer_resources:
   memory: 107374182400 # 100 GB memory

diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.ipynb
@@ -413,7 +413,6 @@
     "        \"fan_in_fan_out\": False,\n",
     "        \"init_lora_weights\": True\n",
     "    },\n",
-    "    \"dataset_size_scaling_factor\": 1e10, # Very large number\n",
     "}"
    ]
   },

diff --git a/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md b/templates/fine-tune-llm_v2/end-to-end-examples/fine-tune-function-calling/README.md
@@ -274,7 +274,6 @@ config = {
         "fan_in_fan_out": False,
         "init_lora_weights": True
     },
-    "dataset_size_scaling_factor": 1e10, # Very large number
 }
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -274,7 +274,6 @@ config = { @@
             "fan_in_fan_out": False,
             "init_lora_weights": True
         },
-        "dataset_size_scaling_factor": 1e10, # Very large number
     }
     ```
@@ Expand Down @@