[CI] Improve Docs CI Efficiency #3587

Merged · 9 commits · Feb 15, 2025
3 changes: 3 additions & 0 deletions .github/workflows/execute-notebook.yml
@@ -36,6 +36,9 @@ jobs:
run: |
bash scripts/ci_install_dependency.sh
pip install -r docs/requirements.txt
apt-get update
apt-get install -y pandoc
apt-get update && apt-get install -y parallel

- name: Setup Jupyter Kernel
run: |
4 changes: 4 additions & 0 deletions .github/workflows/pr-test.yml
@@ -8,13 +8,15 @@ on:
- "python/sglang/**"
- "test/**"
- "docs/**"
- "scripts/**"
pull_request:
branches: [ main ]
paths:
- "python/pyproject.toml"
- "python/sglang/**"
- "test/**"
- "docs/**"
- "scripts/**"
workflow_dispatch:
inputs:
version:
@@ -45,6 +47,8 @@ jobs:
filters: |
docs:
- 'docs/**'
scripts:
- 'scripts/**'
sglang:
- 'python/sglang/**'
test:
1 change: 1 addition & 0 deletions .github/workflows/release-docs.yml
@@ -32,6 +32,7 @@ jobs:
pip install -r docs/requirements.txt
apt-get update
apt-get install -y pandoc
apt-get update && apt-get install -y parallel

- name: Setup Jupyter Kernel
run: |
46 changes: 27 additions & 19 deletions docs/Makefile
@@ -1,34 +1,42 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the terminal, and also
# from the environment for the first two.
# Minimal Makefile for Sphinx documentation
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# New target to compile Markdown and Jupyter Notebook files
# Compile Notebook files and record execution time
compile:
find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print | while read nb; do \
if [ -f "$$nb" ]; then \
echo "Executing $$nb"; \
jupyter nbconvert --to notebook --execute --inplace "$$nb" \
--ExecutePreprocessor.timeout=600 \
--ExecutePreprocessor.kernel_name=python3 || exit 1; \
fi; \
done
@set -e; \
echo "Starting Notebook compilation..."; \
mkdir -p logs; \
echo "Notebook execution timings:" > logs/timing.log; \
START_TOTAL=$$(date +%s); \
find $(SOURCEDIR) -path "*/_build/*" -prune -o -name "*.ipynb" -print0 | \
parallel -0 -j3 --halt soon,fail=1 ' \
NB_NAME=$$(basename {}); \
START_TIME=$$(date +%s); \
jupyter nbconvert --to notebook --execute --inplace "{}" \
--ExecutePreprocessor.timeout=600 \
--ExecutePreprocessor.kernel_name=python3; \
RET_CODE=$$?; \
END_TIME=$$(date +%s); \
ELAPSED_TIME=$$((END_TIME - START_TIME)); \
echo "$${NB_NAME}: $${ELAPSED_TIME}s" >> logs/timing.log; \
exit $$RET_CODE' || exit 1; \
END_TOTAL=$$(date +%s); \
TOTAL_ELAPSED=$$((END_TOTAL - START_TOTAL)); \
echo "---------------------------------" >> logs/timing.log; \
echo "Total execution time: $${TOTAL_ELAPSED}s" >> logs/timing.log; \
echo "All Notebook execution timings:" && cat logs/timing.log


.PHONY: help Makefile compile
.PHONY: help Makefile compile clean

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

clean:
rm -rf $(BUILDDIR)/*
rm -rf $(BUILDDIR)/* logs/timing.log
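The rewritten compile target leans on GNU parallel: three notebooks execute at once (-j3), --halt soon,fail=1 stops scheduling new notebooks after the first failure, and each run's wall time is appended to logs/timing.log. For readers unfamiliar with parallel, a rough Python equivalent of that logic is sketched below; it is illustrative only (the CI runs the Makefile above), and unlike --halt soon,fail=1 it finishes every notebook before reporting failure.

import pathlib
import subprocess
import time
from concurrent.futures import ThreadPoolExecutor

def run_notebook(nb: pathlib.Path):
    """Execute one notebook in place and return (name, seconds, exit code)."""
    start = time.time()
    proc = subprocess.run(
        [
            "jupyter", "nbconvert", "--to", "notebook", "--execute",
            "--inplace", str(nb),
            "--ExecutePreprocessor.timeout=600",
            "--ExecutePreprocessor.kernel_name=python3",
        ]
    )
    return nb.name, time.time() - start, proc.returncode

# Same file selection as the Makefile: every *.ipynb outside _build/.
notebooks = [p for p in pathlib.Path(".").rglob("*.ipynb") if "_build" not in p.parts]

with ThreadPoolExecutor(max_workers=3) as pool:  # parallel -j3
    results = list(pool.map(run_notebook, notebooks))

for name, seconds, code in results:
    print(f"{name}: {seconds:.0f}s")  # the Makefile writes these to logs/timing.log
if any(code != 0 for _, _, code in results):
    raise SystemExit(1)  # roughly --halt soon,fail=1, minus the early abort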
Empty file added docs/backend/__init__.py
28 changes: 15 additions & 13 deletions docs/backend/function_calling.ipynb
@@ -31,17 +31,19 @@
"source": [
"from openai import OpenAI\n",
"import json\n",
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
Comment on lines +37 to +38

Contributor: 🤔 do we really want to include CI logic in the docs? I am afraid that it might confuse some users, since not everyone knows what CI is.

Collaborator: I think we do not have a better way 😂

"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"\n",
"server_process = execute_shell_command(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --tool-call-parser llama3 --port 30333 --host 0.0.0.0\" # llama3\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --tool-call-parser llama3 --host 0.0.0.0\" # llama3\n",
")\n",
"wait_for_server(\"http://localhost:30333\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -141,7 +143,7 @@
"outputs": [],
"source": [
"# Initialize OpenAI-like client\n",
"client = OpenAI(api_key=\"None\", base_url=\"http://0.0.0.0:30333/v1\")\n",
"client = OpenAI(api_key=\"None\", base_url=f\"http://0.0.0.0:{port}/v1\")\n",
"model_name = client.models.list().data[0].id"
]
},
@@ -377,13 +379,13 @@
" tools=tools,\n",
")\n",
"\n",
"gen_url = \"http://localhost:30333/generate\"\n",
"gen_url = f\"http://localhost:{port}/generate\"\n",
"gen_data = {\"text\": input, \"sampling_params\": {\"skip_special_tokens\": False}}\n",
"gen_response = requests.post(gen_url, json=gen_data).json()[\"text\"]\n",
"print(gen_response)\n",
"\n",
"# parse the response\n",
"parse_url = \"http://localhost:30333/function_call\"\n",
"parse_url = f\"http://localhost:{port}/function_call\"\n",
"\n",
"function_call_input = {\n",
" \"text\": gen_response,\n",
@@ -403,7 +405,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
"terminate_process(server_process, port)"
]
},
{
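The patch module imported under is_in_ci() in the review thread above is not part of this diff, so its behavior has to be inferred: it must accept the launch command (now written without a --port flag) and return the process handle plus the chosen port, matching the (server_process, port) unpacking in the notebooks. A minimal hypothetical sketch, assuming it simply grabs a free port and delegates to the existing execute_shell_command helper, could look like this.

# Hypothetical sketch of the CI-only patch.launch_server_cmd; the real module
# is not shown in this diff, so the name choices and behavior are assumptions.
import socket
from sglang.utils import execute_shell_command

def launch_server_cmd(command: str):
    # Bind to port 0 so the OS hands out a free port (assumption: the patch
    # exists to keep parallel CI notebook runs from colliding on fixed ports).
    with socket.socket() as s:
        s.bind(("", 0))
        free_port = s.getsockname()[1]
    process = execute_shell_command(f"{command} --port {free_port}")
    return process, free_port

Whatever the real implementation looks like, the contract the notebooks rely on is just "command string in, (process, port) out".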
81 changes: 45 additions & 36 deletions docs/backend/native_api.ipynb
@@ -34,22 +34,22 @@
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"\n",
"import requests\n",
"from sglang.test.test_utils import is_in_ci\n",
"\n",
"server_process = execute_shell_command(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n",
"\"\"\"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30010\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -66,7 +66,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/generate\"\n",
"url = f\"http://localhost:{port}/generate\"\n",
"data = {\"text\": \"What is the capital of France?\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -92,7 +92,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/get_model_info\"\n",
"url = f\"http://localhost:{port}/get_model_info\"\n",
"\n",
"response = requests.get(url)\n",
"response_json = response.json()\n",
@@ -123,7 +123,7 @@
"source": [
"# get_server_info\n",
"\n",
"url = \"http://localhost:30010/get_server_info\"\n",
"url = f\"http://localhost:{port}/get_server_info\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -144,7 +144,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health_generate\"\n",
"url = f\"http://localhost:{port}/health_generate\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -156,7 +156,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health\"\n",
"url = f\"http://localhost:{port}/health\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
@@ -179,7 +179,7 @@
"source": [
"# flush cache\n",
"\n",
"url = \"http://localhost:30010/flush_cache\"\n",
"url = f\"http://localhost:{port}/flush_cache\"\n",
"\n",
"response = requests.post(url)\n",
"print_highlight(response.text)"
@@ -204,7 +204,7 @@
"source": [
"# successful update with same architecture and size\n",
"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -222,7 +222,7 @@
"source": [
"# failed update with different parameter size or wrong name\n",
"\n",
"url = \"http://localhost:30010/update_weights_from_disk\"\n",
"url = f\"http://localhost:{port}/update_weights_from_disk\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
@@ -252,16 +252,16 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)\n",
"terminate_process(server_process, port)\n",
"\n",
"embedding_process = execute_shell_command(\n",
"embedding_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
" --port 30020 --host 0.0.0.0 --is-embedding\n",
" --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30020\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -272,14 +272,23 @@
"source": [
"# successful encode for embedding model\n",
"\n",
"url = \"http://localhost:30020/encode\"\n",
"url = f\"http://localhost:{port}/encode\"\n",
"data = {\"model\": \"Alibaba-NLP/gte-Qwen2-7B-instruct\", \"text\": \"Once upon a time\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"response_json = response.json()\n",
"print_highlight(f\"Text embedding (first 10): {response_json['embedding'][:10]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process, port)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -295,18 +304,18 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)\n",
"terminate_process(embedding_process, port)\n",
"\n",
"# Note that SGLang now treats embedding models and reward models as the same type of models.\n",
"# This will be updated in the future.\n",
"\n",
"reward_process = execute_shell_command(\n",
"reward_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --port 30030 --host 0.0.0.0 --is-embedding\n",
"python -m sglang.launch_server --model-path Skywork/Skywork-Reward-Llama-3.1-8B-v0.2 --host 0.0.0.0 --is-embedding\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30030\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -332,7 +341,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
"\n",
"url = \"http://localhost:30030/classify\"\n",
"url = f\"http://localhost:{port}/classify\"\n",
"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
"\n",
"responses = requests.post(url, json=data).json()\n",
Expand All @@ -346,7 +355,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(reward_process)"
"terminate_process(reward_process, port)"
]
},
{
@@ -364,13 +373,13 @@
"metadata": {},
"outputs": [],
"source": [
"tokenizer_free_server_process = execute_shell_command(\n",
"tokenizer_free_server_process, port = launch_server_cmd(\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010 --skip-tokenizer-init\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --skip-tokenizer-init\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30010\")"
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
@@ -390,7 +399,7 @@
"print_highlight(f\"Tokenized Input: {input_tokens}\")\n",
"\n",
"response = requests.post(\n",
" \"http://localhost:30010/generate\",\n",
" f\"http://localhost:{port}/generate\",\n",
" json={\n",
" \"input_ids\": input_tokens,\n",
" \"sampling_params\": {\n",
Expand All @@ -416,7 +425,7 @@
"metadata": {},
"outputs": [],
"source": [
"terminate_process(tokenizer_free_server_process)"
"terminate_process(tokenizer_free_server_process, port)"
]
}
],
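Outside CI, the notebooks now import launch_server_cmd from sglang.utils instead of hard-coding ports through execute_shell_command. Condensed into a standalone script, the launch/query/terminate flow both updated notebooks follow looks roughly like this; the signatures are taken from the diff itself (launch_server_cmd returns a (process, port) pair and terminate_process now also takes the port).

import requests
from sglang.utils import launch_server_cmd, wait_for_server, terminate_process, print_highlight

# Launch the server without a fixed --port; the helper picks one dynamically.
server_process, port = launch_server_cmd(
    "python -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --host 0.0.0.0"
)
wait_for_server(f"http://localhost:{port}")  # block until the server is ready

# Hit the native /generate endpoint on the dynamically assigned port.
response = requests.post(
    f"http://localhost:{port}/generate",
    json={"text": "What is the capital of France?"},
)
print_highlight(response.json())

terminate_process(server_process, port)  # shut the server down and free the port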