Update templates after v0.5.8 llmforge release #391

Merged · 11 commits · Nov 20, 2024
4 changes: 2 additions & 2 deletions templates/e2e-dspy-workflow/README.ipynb
@@ -863,7 +863,7 @@
" <span style=\"color: #008000; text-decoration-color: #008000\">'name'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'dspy-llmforge-fine-tuning-job'</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'entrypoint'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'llmforge anyscale finetune configs/training/lora/llama-3-8b.yaml'</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'working_dir'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'.'</span>,\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'image_uri'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'localhost:5555/anyscale/llm-forge:0.5.7'</span>\n",
" <span style=\"color: #008000; text-decoration-color: #008000\">'image_uri'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'localhost:5555/anyscale/llm-forge:0.5.8'</span>\n",
"<span style=\"font-weight: bold\">}</span>\n",
"</pre>\n"
],
@@ -872,7 +872,7 @@
" \u001b[32m'name'\u001b[0m: \u001b[32m'dspy-llmforge-fine-tuning-job'\u001b[0m,\n",
" \u001b[32m'entrypoint'\u001b[0m: \u001b[32m'llmforge anyscale finetune configs/training/lora/llama-3-8b.yaml'\u001b[0m,\n",
" \u001b[32m'working_dir'\u001b[0m: \u001b[32m'.'\u001b[0m,\n",
" \u001b[32m'image_uri'\u001b[0m: \u001b[32m'localhost:5555/anyscale/llm-forge:0.5.7'\u001b[0m\n",
" \u001b[32m'image_uri'\u001b[0m: \u001b[32m'localhost:5555/anyscale/llm-forge:0.5.8'\u001b[0m\n",
"\u001b[1m}\u001b[0m\n"
]
},
2 changes: 1 addition & 1 deletion templates/e2e-dspy-workflow/README.md
@@ -519,7 +519,7 @@ rich.print(yaml.safe_load(open(job_config_path)))
<span style="color: #008000; text-decoration-color: #008000">'name'</span>: <span style="color: #008000; text-decoration-color: #008000">'dspy-llmforge-fine-tuning-job'</span>,
<span style="color: #008000; text-decoration-color: #008000">'entrypoint'</span>: <span style="color: #008000; text-decoration-color: #008000">'llmforge anyscale finetune configs/training/lora/llama-3-8b.yaml'</span>,
<span style="color: #008000; text-decoration-color: #008000">'working_dir'</span>: <span style="color: #008000; text-decoration-color: #008000">'.'</span>,
<span style="color: #008000; text-decoration-color: #008000">'image_uri'</span>: <span style="color: #008000; text-decoration-color: #008000">'localhost:5555/anyscale/llm-forge:0.5.7'</span>
<span style="color: #008000; text-decoration-color: #008000">'image_uri'</span>: <span style="color: #008000; text-decoration-color: #008000">'localhost:5555/anyscale/llm-forge:0.5.8'</span>
<span style="font-weight: bold">}</span>
</pre>

2 changes: 1 addition & 1 deletion templates/e2e-dspy-workflow/configs/job.yaml
@@ -1,4 +1,4 @@
name: "dspy-llmforge-fine-tuning-job"
entrypoint: "llmforge anyscale finetune configs/training/lora/llama-3-8b.yaml"
working_dir: "."
image_uri: "localhost:5555/anyscale/llm-forge:0.5.7"
image_uri: "localhost:5555/anyscale/llm-forge:0.5.8"
2 changes: 1 addition & 1 deletion templates/e2e-llm-workflows/deploy/jobs/ft.yaml
@@ -1,6 +1,6 @@
name: e2e-llm-workflows
entrypoint: llmforge anyscale finetune configs/training/lora/llama-3-8b.yaml
image_uri: localhost:5555/anyscale/llm-forge:0.5.7
image_uri: localhost:5555/anyscale/llm-forge:0.5.8
requirements: []
max_retries: 1
excludes: ["assets"]
@@ -32,7 +32,7 @@ num_checkpoints_to_keep: 1

# Deepspeed configuration, you can provide your own deepspeed setup
deepspeed:
config_path: deepspeed_configs/zero_3_hpz.json
config_path: deepspeed_configs/zero_3_offload_optim+param.json

# Accelerator type. The value of 0.001 is not important, as long as it is
# between 0 and 1. This ensures that accelerator type is used per trainer
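For context, the hunk above switches this template's DeepSpeed setup from `zero_3_hpz.json` to `zero_3_offload_optim+param.json`, i.e. ZeRO stage 3 with optimizer state and parameters offloaded to CPU. The JSON file itself is not part of this diff; the sketch below, written as a Python dict with assumed values, only illustrates what such a config typically contains.

```python
# Illustrative sketch only: the actual deepspeed_configs/zero_3_offload_optim+param.json
# shipped with llmforge is not shown in this PR and may differ.
zero_3_offload_optim_param = {
    "zero_optimization": {
        "stage": 3,  # shard parameters, gradients, and optimizer state
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
    "bf16": {"enabled": True},  # assumed mixed-precision setting
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}
```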
@@ -0,0 +1,77 @@
# Change this to the model you want to fine-tune
model_id: meta-llama/Meta-Llama-3-8B-Instruct

# Change this to the path to your training data
train_path: s3://air-example-data/gsm8k/train.jsonl

# Change this to the path to your validation data. This is optional
valid_path: s3://air-example-data/gsm8k/test.jsonl

# Change this to the context length you want to use. Examples with longer
# context length will be truncated.
context_length: 512

# Change this to total number of GPUs that you want to use
num_devices: 4

# Change this to the number of epochs that you want to train for
num_epochs: 3

# Change this to the batch size that you want to use
train_batch_size_per_device: 2
eval_batch_size_per_device: 4
gradient_accumulation_steps: 2


# Change this to the learning rate that you want to use
learning_rate: 1e-4

# This will pad batches to the longest sequence. Use "max_length" when profiling to profile the worst case.
padding: "longest"

# By default, we will keep the best checkpoint. You can change this to keep more checkpoints.
num_checkpoints_to_keep: 1

# Deepspeed configuration, you can provide your own deepspeed setup
deepspeed:
config_path: deepspeed_configs/zero_2.json

logger:
provider: wandb

# Accelerator type. The value of 0.001 is not important, as long as it is
# between 0 and 1. This ensures that accelerator type is used per trainer
# worker.
worker_resources:
anyscale/accelerator_shape:4xA10G: 0.001

# Liger kernel configuration
liger_kernel:
enabled: True
# You can further customize the individual liger kernel configurations here. By default,
# all the `kwargs` are `True` when liger is enabled.
# kwargs:
# rms_norm: True
# rope: True
# swiglu: True
# cross_entropy: True
# fused_linear_cross_entropy: True

# Lora configuration
lora_config:
r: 8
lora_alpha: 16
lora_dropout: 0.05
target_modules:
- q_proj
- v_proj
- k_proj
- o_proj
- gate_proj
- up_proj
- down_proj
- embed_tokens
- lm_head
task_type: "CAUSAL_LM"
bias: "none"
modules_to_save: []
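The `lora_config` block above mirrors the fields of a PEFT `LoraConfig`. As a rough illustration (an assumption about how llmforge consumes these fields, not something this template states), the equivalent object in Python would be:

```python
# Illustration only: mapping the lora_config YAML block onto peft.LoraConfig.
# llmforge's actual handling of these fields may differ.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,               # LoRA rank
    lora_alpha=16,     # scaling factor applied to the LoRA updates
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    task_type="CAUSAL_LM",
    bias="none",
    modules_to_save=[],
)
```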
59 changes: 40 additions & 19 deletions templates/llm-router/README.ipynb
@@ -111,7 +111,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -306,7 +306,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -330,7 +330,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -461,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -578,7 +578,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -644,7 +644,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -761,7 +761,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -904,7 +904,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -948,7 +948,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -988,7 +988,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1025,7 +1025,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1055,7 +1055,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -1079,7 +1079,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1092,7 +1092,9 @@
"context_length: 1024\n",
"num_devices: 8\n",
"num_epochs: 5\n",
"checkpoint_every_n_epochs: 5\n",
"checkpoint_and_evaluation_frequency: \n",
" unit: epochs\n",
" frequency: 5\n",
"train_batch_size_per_device: 4\n",
"eval_batch_size_per_device: 4\n",
"lr_scheduler_type: constant\n",
@@ -1120,7 +1122,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1206,7 +1208,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1273,7 +1275,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1371,7 +1373,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1400,6 +1402,25 @@
"This plot illustrates that as we relax the cost constraints (i.e., increase the percentage of GPT-4 calls), the performance improves. While the performance of a random router improves linearly with cost, our router achieves significantly better results at each cost level."
]
},
{
Contributor: what is this?

Member Author: I updated the router template to use the new 0.5.8 image. I noticed that the cell execution numbers are all messed up in the notebook, so I copied over some cleanup code from the E2E LLM Workflows template to clean up the cell numbers and cached checkpoints.

"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cleanup\n",
"!python src/clear_cell_nums.py\n",
"!find . | grep -E \".ipynb_checkpoints\" | xargs rm -rf\n",
"!find . | grep -E \"(__pycache__|\\.pyc|\\.pyo)\" | xargs rm -rf"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -1425,7 +1446,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
"version": "3.11.9"
}
},
"nbformat": 4,
14 changes: 13 additions & 1 deletion templates/llm-router/README.md
@@ -715,7 +715,7 @@ For this tutorial, we will perform full-parameter finetuning of Llama3-8B on the
context_length: 1024
num_devices: 8
num_epochs: 5
checkpoint_every_n_epochs: 5
checkpoint_and_evaluation_frequency:
unit: epochs
frequency: 5
train_batch_size_per_device: 4
eval_batch_size_per_device: 4
lr_scheduler_type: constant
@@ -912,5 +914,15 @@ display(Image(filename=image_path))

This plot illustrates that as we relax the cost constraints (i.e., increase the percentage of GPT-4 calls), the performance improves. While the performance of a random router improves linearly with cost, our router achieves significantly better results at each cost level.

## Cleanup


```python
# Cleanup
!python src/clear_cell_nums.py
!find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
!find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
```

# Conclusion
In this tutorial, we have successfully built and evaluated a finetuned-LLM router. We generated synthetic labeled data using the LLM-as-a-judge method to train the model, finetuned an LLM classifier using Anyscale's API, and conducted an offline evaluation on a standard benchmark, demonstrating that our model is effective at out-of-domain generalization.
4 changes: 3 additions & 1 deletion templates/llm-router/configs/ft_config_a10.yaml
@@ -4,7 +4,9 @@ valid_path: /mnt/user_storage/train_data_sample.jsonl
context_length: 1024
num_devices: 8
num_epochs: 5
checkpoint_every_n_epochs: 5
checkpoint_and_evaluation_frequency:
unit: epochs
frequency: 5
train_batch_size_per_device: 4
eval_batch_size_per_device: 4
lr_scheduler_type: constant
23 changes: 23 additions & 0 deletions templates/llm-router/src/clear_cell_nums.py
@@ -0,0 +1,23 @@
from pathlib import Path

import nbformat


def clear_execution_numbers(nb_path):
with open(nb_path, "r", encoding="utf-8") as f:
nb = nbformat.read(f, as_version=4)
for cell in nb["cells"]:
if cell["cell_type"] == "code":
cell["execution_count"] = None
for output in cell["outputs"]:
if "execution_count" in output:
output["execution_count"] = None
with open(nb_path, "w", encoding="utf-8") as f:
nbformat.write(nb, f)


if __name__ == "__main__":
ROOT_DIR = Path(__file__).parent.parent
notebook_fps = list(ROOT_DIR.glob("**/*.ipynb"))
for fp in notebook_fps:
clear_execution_numbers(fp)