From 0cf4330dc8caf43e80ae4631ca8d63f8ebf2d18b Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 22:54:30 +0000 Subject: [PATCH 1/6] fix error in docs ci --- docs/Makefile | 2 +- docs/backend/native_api.ipynb | 208 +++- docs/backend/offline_engine_api.ipynb | 59 +- docs/backend/openai_api_completions.ipynb | 1106 ++++++++++++++++++++- docs/backend/openai_api_embeddings.ipynb | 59 +- docs/backend/openai_api_vision.ipynb | 64 +- docs/start/send_request.ipynb | 872 +++++++++++++++- 7 files changed, 2259 insertions(+), 111 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 50f77a30c0..13d81f4f84 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,7 +19,7 @@ compile: echo "Executing $$nb"; \ jupyter nbconvert --to notebook --execute --inplace "$$nb" \ --ExecutePreprocessor.timeout=600 \ - --ExecutePreprocessor.kernel_name=python3; \ + --ExecutePreprocessor.kernel_name=python3 || exit 1; \ fi; \ done diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 7207259ea3..73fc54a038 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -31,8 +31,73 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:18.728819Z", + "iopub.status.busy": "2024-12-02T22:48:18.728690Z", + "iopub.status.idle": "2024-12-02T22:48:47.958696Z", + "shell.execute_reply": "2024-12-02T22:48:47.958226Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:53:31] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-1B-Instruct', chat_template=None, is_embedding=False, revision=None, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=412569283, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", + 
"/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "[2024-12-02 22:53:38 TP0] Init torch distributed begin.\n", + "[2024-12-02 22:53:39 TP0] Load weight begin. avail mem=4.98 GB\n", + "[2024-12-02 22:53:39 TP0] lm_eval is not installed, GPTQ may not be usable\n", + "[2024-12-02 22:53:39 TP0] Using model weights format ['*.safetensors']\n", + "[2024-12-02 22:53:39 TP0] No model.safetensors.index.json found in remote.\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00\n", + " torch.empty(\n", + "torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 79.10 GiB of which 27.50 MiB is free. Process 3843098 has 32.29 GiB memory in use. Process 3908588 has 41.52 GiB memory in use. Including non-PyTorch memory, this process has 5.24 GiB memory in use. Of the allocated memory 4.63 GiB is allocated by PyTorch, and 19.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", + "\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", + "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", + "\u001b[1;31mClick here for more info. \n", + "\u001b[1;31mView Jupyter log for further details." 
+ ] + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -63,7 +128,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:47.960280Z", + "iopub.status.busy": "2024-12-02T22:48:47.960068Z", + "iopub.status.idle": "2024-12-02T22:48:48.227582Z", + "shell.execute_reply": "2024-12-02T22:48:48.227156Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/generate\"\n", @@ -89,7 +161,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.228900Z", + "iopub.status.busy": "2024-12-02T22:48:48.228763Z", + "iopub.status.idle": "2024-12-02T22:48:48.234059Z", + "shell.execute_reply": "2024-12-02T22:48:48.233689Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/get_model_info\"\n", @@ -118,7 +197,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.235454Z", + "iopub.status.busy": "2024-12-02T22:48:48.235125Z", + "iopub.status.idle": "2024-12-02T22:48:48.239787Z", + "shell.execute_reply": "2024-12-02T22:48:48.239433Z" + } + }, "outputs": [], "source": [ "# get_server_info\n", @@ -141,7 +227,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.240966Z", + "iopub.status.busy": "2024-12-02T22:48:48.240838Z", + "iopub.status.idle": "2024-12-02T22:48:48.254049Z", + "shell.execute_reply": "2024-12-02T22:48:48.253663Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/health_generate\"\n", @@ -153,7 +246,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.255429Z", + "iopub.status.busy": "2024-12-02T22:48:48.255085Z", + "iopub.status.idle": "2024-12-02T22:48:48.259344Z", + "shell.execute_reply": "2024-12-02T22:48:48.258964Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/health\"\n", @@ -174,7 +274,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.260464Z", + "iopub.status.busy": "2024-12-02T22:48:48.260335Z", + "iopub.status.idle": "2024-12-02T22:48:48.265409Z", + "shell.execute_reply": "2024-12-02T22:48:48.264993Z" + } + }, "outputs": [], "source": [ "# flush cache\n", @@ -199,7 +306,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.266586Z", + "iopub.status.busy": "2024-12-02T22:48:48.266452Z", + "iopub.status.idle": "2024-12-02T22:48:48.946566Z", + "shell.execute_reply": "2024-12-02T22:48:48.946081Z" + } + }, "outputs": [], "source": [ "# successful update with same architecture and size\n", @@ -217,22 +331,29 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.948180Z", + "iopub.status.busy": "2024-12-02T22:48:48.947760Z", + "iopub.status.idle": "2024-12-02T22:48:49.075751Z", + "shell.execute_reply": "2024-12-02T22:48:49.075374Z" + } + }, "outputs": [], "source": [ - "# failed update with different parameter size\n", + "# failed update with different parameter size or wrong name\n", "\n", "url = 
\"http://localhost:30010/update_weights_from_disk\"\n", - "data = {\"model_path\": \"meta-llama/Llama-3.2-3B\"}\n", + "data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n", "\n", "response = requests.post(url, json=data)\n", "response_json = response.json()\n", "print_highlight(response_json)\n", "assert response_json[\"success\"] is False\n", "assert response_json[\"message\"] == (\n", - " \"Failed to update weights: The size of tensor a (2048) must match \"\n", - " \"the size of tensor b (3072) at non-singleton dimension 1.\\n\"\n", - " \"Rolling back to original weights.\"\n", + " \"Failed to get weights iterator: \"\n", + " \"meta-llama/Llama-3.2-1B-wrong\"\n", + " \" (repository not found).\"\n", ")" ] }, @@ -249,7 +370,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:49.076953Z", + "iopub.status.busy": "2024-12-02T22:48:49.076828Z", + "iopub.status.idle": "2024-12-02T22:49:23.694647Z", + "shell.execute_reply": "2024-12-02T22:49:23.694216Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)\n", @@ -267,7 +395,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:23.696104Z", + "iopub.status.busy": "2024-12-02T22:49:23.695882Z", + "iopub.status.idle": "2024-12-02T22:49:23.723827Z", + "shell.execute_reply": "2024-12-02T22:49:23.723526Z" + } + }, "outputs": [], "source": [ "# successful encode for embedding model\n", @@ -292,7 +427,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:23.724947Z", + "iopub.status.busy": "2024-12-02T22:49:23.724827Z", + "iopub.status.idle": "2024-12-02T22:49:47.825508Z", + "shell.execute_reply": "2024-12-02T22:49:47.825004Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)\n", @@ -312,7 +454,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:47.826961Z", + "iopub.status.busy": "2024-12-02T22:49:47.826823Z", + "iopub.status.idle": "2024-12-02T22:49:48.853347Z", + "shell.execute_reply": "2024-12-02T22:49:48.852882Z" + } + }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", @@ -332,7 +481,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n", "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", - "url = \"http://localhost:30030/classify\"\n", + "url = \"http://localhost:30030/encode\"\n", "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", @@ -342,8 +491,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:48.854711Z", + "iopub.status.busy": "2024-12-02T22:49:48.854489Z", + "iopub.status.idle": "2024-12-02T22:49:48.919126Z", + "shell.execute_reply": "2024-12-02T22:49:48.918561Z" + } + }, "outputs": [], "source": [ "terminate_process(reward_process)" @@ -351,6 +507,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -360,7 +521,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 7ce89d435d..302f060b02 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -33,7 +33,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:40.725673Z", + "iopub.status.busy": "2024-12-02T22:47:40.725546Z", + "iopub.status.idle": "2024-12-02T22:48:06.979747Z", + "shell.execute_reply": "2024-12-02T22:48:06.978839Z" + } + }, "outputs": [], "source": [ "# launch the offline engine\n", @@ -54,7 +61,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:06.982095Z", + "iopub.status.busy": "2024-12-02T22:48:06.981910Z", + "iopub.status.idle": "2024-12-02T22:48:08.220069Z", + "shell.execute_reply": "2024-12-02T22:48:08.219489Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -82,7 +96,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:08.221656Z", + "iopub.status.busy": "2024-12-02T22:48:08.221505Z", + "iopub.status.idle": "2024-12-02T22:48:11.077997Z", + "shell.execute_reply": "2024-12-02T22:48:11.077319Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -113,7 +134,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:11.079727Z", + "iopub.status.busy": "2024-12-02T22:48:11.079451Z", + "iopub.status.idle": "2024-12-02T22:48:12.007378Z", + "shell.execute_reply": "2024-12-02T22:48:12.006821Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -148,7 +176,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:12.008941Z", + "iopub.status.busy": "2024-12-02T22:48:12.008803Z", + "iopub.status.idle": "2024-12-02T22:48:14.701721Z", + "shell.execute_reply": "2024-12-02T22:48:14.701146Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -177,8 +212,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:14.703412Z", + "iopub.status.busy": "2024-12-02T22:48:14.702985Z", + "iopub.status.idle": "2024-12-02T22:48:14.767369Z", + "shell.execute_reply": "2024-12-02T22:48:14.766815Z" + } + }, "outputs": [], "source": [ "llm.shutdown()" @@ -195,7 +237,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 067a046885..552c60adbb 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,9 +36,150 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:17.996308Z", + "iopub.status.busy": "2024-12-02T22:46:17.995895Z", + "iopub.status.idle": "2024-12-02T22:46:50.231557Z", + "shell.execute_reply": "2024-12-02T22:46:50.231084Z" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:24] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=675216789, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:32 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] Load weight begin. 
avail mem=46.29 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -69,9 +210,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:50.233143Z", + "iopub.status.busy": "2024-12-02T22:46:50.232949Z", + "iopub.status.idle": "2024-12-02T22:46:50.886412Z", + "shell.execute_reply": "2024-12-02T22:46:50.885947Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 42, #cached-token: 1, cache hit rate: 2.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Decode batch. #running-req: 1, #token: 76, token usage: 0.00, gen throughput (token/s): 7.04, #queue-req: 0\n", + "[2024-12-02 22:46:50] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='d1d6ed6246d5474e94dec1325b85ede3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179610, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import openai\n", "\n", @@ -102,9 +278,59 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:50.887802Z", + "iopub.status.busy": "2024-12-02T22:46:50.887655Z", + "iopub.status.idle": "2024-12-02T22:46:51.825108Z", + "shell.execute_reply": "2024-12-02T22:46:51.824704Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 51, #cached-token: 25, cache hit rate: 20.63%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:50 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 106, token usage: 0.00, gen throughput (token/s): 125.80, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 146, token usage: 0.00, gen throughput (token/s): 142.02, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. 
#running-req: 1, #token: 186, token usage: 0.00, gen throughput (token/s): 141.54, #queue-req: 0\n", + "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Ancient Rome's major achievements include:

1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their engineering skills and architectural innovations.
2. **Law and Governance**: The Romans developed the Twelve Tables, a precursor to modern law, and established a system of governance that included the Senate and the Assemblies.
3. **Military Conquests**: Rome expanded its territories through a series of military campaigns, creating a vast empire that stretched from Britain to Egypt.
4. **Infrastructure Development**: They built roads, bridges, aqueducts, and canals, which facilitated trade
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -141,9 +367,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:51.826415Z", + "iopub.status.busy": "2024-12-02T22:46:51.826283Z", + "iopub.status.idle": "2024-12-02T22:46:51.883588Z", + "shell.execute_reply": "2024-12-02T22:46:51.883146Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 10, #cached-token: 30, cache hit rate: 33.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "This is only a test" + ] + } + ], "source": [ "stream = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -167,9 +410,51 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:51.884817Z", + "iopub.status.busy": "2024-12-02T22:46:51.884690Z", + "iopub.status.idle": "2024-12-02T22:46:52.336780Z", + "shell.execute_reply": "2024-12-02T22:46:52.336393Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 32.57%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 128.02, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 146.74, #queue-req: 0\n", + "[2024-12-02 22:46:52] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='bd53d29a96bc45839dcbd37d3dcc1206', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. 
Brazil - Bras', matched_stop=None)], created=1733179612, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -196,9 +481,59 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:52.338049Z", + "iopub.status.busy": "2024-12-02T22:46:52.337919Z", + "iopub.status.idle": "2024-12-02T22:46:53.167934Z", + "shell.execute_reply": "2024-12-02T22:46:53.167533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 31.35%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:52 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 42, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 145.33, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 122, token usage: 0.00, gen throughput (token/s): 144.24, #queue-req: 0\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='f5503996a99040b19445f8121b49c4f1', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' In 200 words or less.\\nAs the stars whizzed by outside, Captain Orion gazed out at the vast expanse of space. Her ship, the Aurora, had been traveling for months, searching for a new home for humanity. The Earth was dying, and the crew of the Aurora was determined to find a new planet to call their own.\\nOrion\\'s eyes sparkled as she scanned the data streaming in from the ship\\'s sensors. 
\"Captain, I\\'m reading a planet with breathable air and liquid water,\" said her navigator, Ensign Amy K\\'Rhyn.', matched_stop='\\n\\n')], created=1733179613, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=118, prompt_tokens=10, total_tokens=128, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -228,9 +563,43 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.169141Z", + "iopub.status.busy": "2024-12-02T22:46:53.169014Z", + "iopub.status.idle": "2024-12-02T22:46:53.441425Z", + "shell.execute_reply": "2024-12-02T22:46:53.441026Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 1, #new-token: 19, #cached-token: 30, cache hit rate: 37.61%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{\"name\": \"Paris\", \"population\": 2147000}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "\n", @@ -273,9 +642,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.442594Z", + "iopub.status.busy": "2024-12-02T22:46:53.442465Z", + "iopub.status.idle": "2024-12-02T22:46:53.576534Z", + "shell.execute_reply": "2024-12-02T22:46:53.576071Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. 
#new-seq: 1, #new-token: 12, #cached-token: 30, cache hit rate: 42.75%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Paris" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -309,9 +706,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.578160Z", + "iopub.status.busy": "2024-12-02T22:46:53.577636Z", + "iopub.status.idle": "2024-12-02T22:46:53.597584Z", + "shell.execute_reply": "2024-12-02T22:46:53.597165Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_eafe977c-1e11-447a-a77f-6905cd2cf267" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 2, #new-token: 18, #cached-token: 62, cache hit rate: 50.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + } + ], "source": [ "import json\n", "import time\n", @@ -364,9 +796,102 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.598950Z", + "iopub.status.busy": "2024-12-02T22:46:53.598642Z", + "iopub.status.idle": "2024-12-02T22:46:56.612608Z", + "shell.execute_reply": "2024-12-02T22:46:56.612127Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Decode batch. 
#running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 87.91, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/batches/batch_eafe977c-1e11-447a-a77f-6905cd2cf267 HTTP/1.1\" 200 OK\n", + "Batch job completed successfully!\n", + "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"DELETE /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc HTTP/1.1\" 200 OK\n" + ] + } + ], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", @@ -412,9 +937,296 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:56.613990Z", + "iopub.status.busy": "2024-12-02T22:46:56.613849Z", + "iopub.status.idle": "2024-12-02T22:47:21.651523Z", + "shell.execute_reply": "2024-12-02T22:47:21.650996Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56 TP0] Prefill batch. #new-seq: 100, #new-token: 3000, #cached-token: 2500, cache hit rate: 45.77%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:56 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 206.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 11850.91, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.06, gen throughput (token/s): 11616.06, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 11346.97, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 11089.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 10835.27, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 10583.19, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:59 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 10363.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:59 TP0] Decode batch. 
#running-req: 100, #token: 35725, token usage: 0.17, gen throughput (token/s): 10145.38, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.19, gen throughput (token/s): 9927.85, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.21, gen throughput (token/s): 9719.36, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.23, gen throughput (token/s): 9533.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:01 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.25, gen throughput (token/s): 9339.12, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:06] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:09] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:12] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:15] INFO: 127.0.0.1:59256 - \"GET 
/v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:18] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "import time\n", @@ -488,9 +1300,217 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:21.653719Z", + "iopub.status.busy": "2024-12-02T22:47:21.653572Z", + "iopub.status.idle": "2024-12-02T22:47:34.687047Z", + "shell.execute_reply": "2024-12-02T22:47:34.686607Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_94c72d7f-b065-4c83-8765-d257480f694f" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 83, #new-token: 83, #cached-token: 4482, cache hit rate: 68.73%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 290, #new-token: 8192, #cached-token: 7743, cache hit rate: 56.55%, token usage: 0.01, #running-req: 83, #queue-req: 127\n", + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 128, #new-token: 3825, #cached-token: 3215, cache hit rate: 54.26%, token usage: 0.05, #running-req: 372, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:22 TP0] Decode batch. #running-req: 500, #token: 28525, token usage: 0.14, gen throughput (token/s): 678.38, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:23 TP0] Decode batch. 
#running-req: 500, #token: 48525, token usage: 0.23, gen throughput (token/s): 26596.73, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 68525, token usage: 0.33, gen throughput (token/s): 25443.33, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 88525, token usage: 0.42, gen throughput (token/s): 24276.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:25 TP0] Decode batch. #running-req: 500, #token: 108525, token usage: 0.52, gen throughput (token/s): 23133.50, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:26 TP0] Decode batch. #running-req: 500, #token: 128525, token usage: 0.61, gen throughput (token/s): 22139.46, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:27 TP0] Decode batch. #running-req: 500, #token: 148525, token usage: 0.71, gen throughput (token/s): 21220.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:28 TP0] Decode batch. #running-req: 500, #token: 168525, token usage: 0.80, gen throughput (token/s): 20377.60, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:29 TP0] Decode batch. #running-req: 500, #token: 188525, token usage: 0.90, gen throughput (token/s): 19555.01, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:30 TP0] Decode batch. #running-req: 500, #token: 208525, token usage: 0.99, gen throughput (token/s): 18893.70, #queue-req: 0\n", + "[2024-12-02 22:47:30 TP0] Decode out of memory happened. #retracted_reqs: 23, #new_token_ratio: 0.3087 -> 0.8200\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:31 TP0] Decode out of memory happened. #retracted_reqs: 21, #new_token_ratio: 0.8009 -> 0.8600\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:31] INFO: 127.0.0.1:60160 - \"POST /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. 
Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"GET /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"DELETE /v1/files/backend_input_file-0acbc8d2-85ef-406a-b610-1591357c7615 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully deleted local batch_requests.jsonl file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "import time\n", @@ -576,8 +1596,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:34.689659Z", + "iopub.status.busy": "2024-12-02T22:47:34.689300Z", + "iopub.status.idle": "2024-12-02T22:47:36.284482Z", + "shell.execute_reply": "2024-12-02T22:47:36.283449Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)" @@ -594,7 +1621,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 65b07c384d..1d106a4783 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -33,7 +33,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:53.281058Z", + "iopub.status.busy": "2024-12-02T22:49:53.280790Z", + "iopub.status.idle": "2024-12-02T22:50:26.517789Z", + "shell.execute_reply": "2024-12-02T22:50:26.517313Z" + } + }, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -63,7 +70,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.519357Z", + "iopub.status.busy": "2024-12-02T22:50:26.519135Z", + "iopub.status.idle": "2024-12-02T22:50:26.555609Z", + "shell.execute_reply": "2024-12-02T22:50:26.555227Z" + } + }, "outputs": [], "source": [ "import subprocess, json\n", @@ -90,7 +104,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.557052Z", + "iopub.status.busy": "2024-12-02T22:50:26.556687Z", + "iopub.status.idle": "2024-12-02T22:50:26.580073Z", + "shell.execute_reply": "2024-12-02T22:50:26.579695Z" + } + }, "outputs": [], "source": [ "import requests\n", @@ -117,7 +138,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.581299Z", + "iopub.status.busy": "2024-12-02T22:50:26.581150Z", + 
"iopub.status.idle": "2024-12-02T22:50:26.933134Z", + "shell.execute_reply": "2024-12-02T22:50:26.932703Z" + } + }, "outputs": [], "source": [ "import openai\n", @@ -146,7 +174,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.934770Z", + "iopub.status.busy": "2024-12-02T22:50:26.934415Z", + "iopub.status.idle": "2024-12-02T22:50:28.842361Z", + "shell.execute_reply": "2024-12-02T22:50:28.841666Z" + } + }, "outputs": [], "source": [ "import json\n", @@ -170,8 +205,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:28.844332Z", + "iopub.status.busy": "2024-12-02T22:50:28.843909Z", + "iopub.status.idle": "2024-12-02T22:50:29.199607Z", + "shell.execute_reply": "2024-12-02T22:50:29.198958Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -188,7 +230,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index af17b44096..4e573304ab 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -37,7 +37,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:33.572361Z", + "iopub.status.busy": "2024-12-02T22:50:33.572012Z", + "iopub.status.idle": "2024-12-02T22:51:08.810946Z", + "shell.execute_reply": "2024-12-02T22:51:08.810425Z" + } + }, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -69,7 +76,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:08.812505Z", + "iopub.status.busy": "2024-12-02T22:51:08.812313Z", + "iopub.status.idle": "2024-12-02T22:51:14.903348Z", + "shell.execute_reply": "2024-12-02T22:51:14.902880Z" + } + }, "outputs": [], "source": [ "import subprocess\n", @@ -113,7 +127,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:14.905012Z", + "iopub.status.busy": "2024-12-02T22:51:14.904629Z", + "iopub.status.idle": "2024-12-02T22:51:15.313109Z", + "shell.execute_reply": "2024-12-02T22:51:15.312647Z" + } + }, "outputs": [], "source": [ "import requests\n", @@ -153,7 +174,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:15.314520Z", + "iopub.status.busy": "2024-12-02T22:51:15.314377Z", + "iopub.status.idle": "2024-12-02T22:51:15.921170Z", + "shell.execute_reply": "2024-12-02T22:51:15.920710Z" + } + }, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -197,7 +225,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:15.922809Z", + "iopub.status.busy": "2024-12-02T22:51:15.922486Z", + "iopub.status.idle": "2024-12-02T22:51:17.150687Z", + "shell.execute_reply": "2024-12-02T22:51:17.150227Z" + } + }, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -238,8 +273,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + 
"execution": { + "iopub.execute_input": "2024-12-02T22:51:17.151990Z", + "iopub.status.busy": "2024-12-02T22:51:17.151850Z", + "iopub.status.idle": "2024-12-02T22:51:18.756750Z", + "shell.execute_reply": "2024-12-02T22:51:18.749616Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -265,6 +307,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -274,7 +321,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 4cb46f1edc..95a8dd13a6 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -30,9 +30,150 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:23.128756Z", + "iopub.status.busy": "2024-12-02T22:51:23.128496Z", + "iopub.status.idle": "2024-12-02T22:51:55.360578Z", + "shell.execute_reply": "2024-12-02T22:51:55.360159Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=720139840, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, 
enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:37 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] Load weight begin. avail mem=46.29 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in their original black color, while the notebook outputs are highlighted in blue.
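The combined logs above come from a simple launch-and-wait pattern. Below is a minimal sketch of it, assuming the standard `python -m sglang.launch_server` entry point; the `--model-path`, `--host`, and `--port` values are taken from the server logs above rather than from this hunk, so treat them as assumptions.

# Minimal sketch of the notebook's launch pattern. The helpers come from
# sglang.utils, which this notebook imports below; model path and port 30000
# mirror the server logs above.
from sglang.utils import execute_shell_command, terminate_process, wait_for_server

server_process = execute_shell_command(
    "python -m sglang.launch_server "
    "--model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000"
)

# Block until the server answers health checks before sending any requests.
wait_for_server("http://localhost:30000")

# ... send requests against http://localhost:30000 here ...

# Free the GPU once the notebook is done with the server.
terminate_process(server_process)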
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -60,9 +201,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.362103Z", + "iopub.status.busy": "2024-12-02T22:51:55.361912Z", + "iopub.status.idle": "2024-12-02T22:51:55.474877Z", + "shell.execute_reply": "2024-12-02T22:51:55.474538Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 41, #cached-token: 1, cache hit rate: 2.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55] INFO: 127.0.0.1:55794 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'id': '9bb7a92e66884dd597ed21d40b371dee', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import subprocess, json\n", "\n", @@ -84,9 +253,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.476355Z", + "iopub.status.busy": "2024-12-02T22:51:55.476031Z", + "iopub.status.idle": "2024-12-02T22:51:55.545152Z", + "shell.execute_reply": "2024-12-02T22:51:55.544851Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 41, cache hit rate: 46.15%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55] INFO: 127.0.0.1:55804 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'id': '9e218a4ee02c483caac6fe038d1d75e7', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import requests\n", "\n", @@ -110,9 +307,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.546339Z", + "iopub.status.busy": "2024-12-02T22:51:55.546190Z", + "iopub.status.idle": "2024-12-02T22:51:56.180978Z", + "shell.execute_reply": "2024-12-02T22:51:56.180604Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 13, #cached-token: 30, cache hit rate: 53.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55 TP0] Decode batch. 
#running-req: 1, #token: 60, token usage: 0.00, gen throughput (token/s): 6.97, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55806 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='14c5f53930714f6a8aaf2c107d60ab2d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179916, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import openai\n", "\n", @@ -138,9 +370,317 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.182456Z", + "iopub.status.busy": "2024-12-02T22:51:56.182078Z", + "iopub.status.idle": "2024-12-02T22:51:56.498669Z", + "shell.execute_reply": "2024-12-02T22:51:56.498331Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55816 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 42, cache hit rate: 64.41%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "Here" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " are" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " countries" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " their" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " capitals" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. 
#running-req: 1, #token: 57, token usage: 0.00, gen throughput (token/s): 133.00, #queue-req: 0\n", + " Japan" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Tokyo" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Australia" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Canberra" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Brazil" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Bras" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ília" + ] + } + ], "source": [ "import openai\n", "\n", @@ -174,9 +714,50 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.499873Z", + "iopub.status.busy": "2024-12-02T22:51:56.499743Z", + "iopub.status.idle": "2024-12-02T22:51:56.734340Z", + "shell.execute_reply": "2024-12-02T22:51:56.733992Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 3, #cached-token: 3, cache hit rate: 63.93%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 17, token usage: 0.00, gen throughput (token/s): 137.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55826 - \"POST /generate HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'text': ' a city of romance, art, fashion, and history. 
Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 3, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'efb0c50df0a34354b3762c764925c8e2'}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import requests\n", "\n", @@ -203,9 +784,244 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.735620Z", + "iopub.status.busy": "2024-12-02T22:51:56.735312Z", + "iopub.status.idle": "2024-12-02T22:51:56.967187Z", + "shell.execute_reply": "2024-12-02T22:51:56.966862Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55836 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 64.55%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + " a" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " city" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " of" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " romance" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " art" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " fashion" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cuisine" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Paris" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " is" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " a" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " must" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "visit" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. 
#running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 138.56, #queue-req: 0\n", + " destination" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " for" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " anyone" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " who" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " loves" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " history" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " architecture" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " culture" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " From" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " the" + ] + } + ], "source": [ "import requests, json\n", "\n", @@ -236,8 +1052,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.968304Z", + "iopub.status.busy": "2024-12-02T22:51:56.968158Z", + "iopub.status.idle": "2024-12-02T22:51:58.584622Z", + "shell.execute_reply": "2024-12-02T22:51:58.583792Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)" @@ -254,7 +1077,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, From d8765ef2d8d7a68c412e78b1ec08582badeccf0c Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 22:59:35 +0000 Subject: [PATCH 2/6] remove ipynb output --- docs/backend/native_api.ipynb | 136 +++----------------- docs/backend/offline_engine_api.ipynb | 59 ++------- docs/backend/openai_api_completions.ipynb | 146 ++++------------------ docs/backend/openai_api_embeddings.ipynb | 59 ++------- docs/backend/openai_api_vision.ipynb | 64 ++-------- docs/start/send_request.ipynb | 91 +++----------- 6 files changed, 84 insertions(+), 471 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 73fc54a038..cd93439d41 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -31,14 +31,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:18.728819Z", - "iopub.status.busy": "2024-12-02T22:48:18.728690Z", - "iopub.status.idle": "2024-12-02T22:48:47.958696Z", - "shell.execute_reply": "2024-12-02T22:48:47.958226Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,14 +121,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:47.960280Z", - "iopub.status.busy": "2024-12-02T22:48:47.960068Z", - "iopub.status.idle": "2024-12-02T22:48:48.227582Z", - "shell.execute_reply": "2024-12-02T22:48:48.227156Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/generate\"\n", @@ -161,14 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - 
"iopub.execute_input": "2024-12-02T22:48:48.228900Z", - "iopub.status.busy": "2024-12-02T22:48:48.228763Z", - "iopub.status.idle": "2024-12-02T22:48:48.234059Z", - "shell.execute_reply": "2024-12-02T22:48:48.233689Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/get_model_info\"\n", @@ -197,14 +176,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.235454Z", - "iopub.status.busy": "2024-12-02T22:48:48.235125Z", - "iopub.status.idle": "2024-12-02T22:48:48.239787Z", - "shell.execute_reply": "2024-12-02T22:48:48.239433Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# get_server_info\n", @@ -227,14 +199,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.240966Z", - "iopub.status.busy": "2024-12-02T22:48:48.240838Z", - "iopub.status.idle": "2024-12-02T22:48:48.254049Z", - "shell.execute_reply": "2024-12-02T22:48:48.253663Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/health_generate\"\n", @@ -246,14 +211,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.255429Z", - "iopub.status.busy": "2024-12-02T22:48:48.255085Z", - "iopub.status.idle": "2024-12-02T22:48:48.259344Z", - "shell.execute_reply": "2024-12-02T22:48:48.258964Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/health\"\n", @@ -274,14 +232,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.260464Z", - "iopub.status.busy": "2024-12-02T22:48:48.260335Z", - "iopub.status.idle": "2024-12-02T22:48:48.265409Z", - "shell.execute_reply": "2024-12-02T22:48:48.264993Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# flush cache\n", @@ -306,14 +257,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.266586Z", - "iopub.status.busy": "2024-12-02T22:48:48.266452Z", - "iopub.status.idle": "2024-12-02T22:48:48.946566Z", - "shell.execute_reply": "2024-12-02T22:48:48.946081Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# successful update with same architecture and size\n", @@ -331,14 +275,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.948180Z", - "iopub.status.busy": "2024-12-02T22:48:48.947760Z", - "iopub.status.idle": "2024-12-02T22:48:49.075751Z", - "shell.execute_reply": "2024-12-02T22:48:49.075374Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# failed update with different parameter size or wrong name\n", @@ -370,14 +307,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:49.076953Z", - "iopub.status.busy": "2024-12-02T22:48:49.076828Z", - "iopub.status.idle": "2024-12-02T22:49:23.694647Z", - "shell.execute_reply": "2024-12-02T22:49:23.694216Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)\n", @@ -395,14 +325,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:23.696104Z", - "iopub.status.busy": "2024-12-02T22:49:23.695882Z", - "iopub.status.idle": "2024-12-02T22:49:23.723827Z", - "shell.execute_reply": 
"2024-12-02T22:49:23.723526Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# successful encode for embedding model\n", @@ -427,14 +350,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:23.724947Z", - "iopub.status.busy": "2024-12-02T22:49:23.724827Z", - "iopub.status.idle": "2024-12-02T22:49:47.825508Z", - "shell.execute_reply": "2024-12-02T22:49:47.825004Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)\n", @@ -454,14 +370,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:47.826961Z", - "iopub.status.busy": "2024-12-02T22:49:47.826823Z", - "iopub.status.idle": "2024-12-02T22:49:48.853347Z", - "shell.execute_reply": "2024-12-02T22:49:48.852882Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", @@ -491,15 +400,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:48.854711Z", - "iopub.status.busy": "2024-12-02T22:49:48.854489Z", - "iopub.status.idle": "2024-12-02T22:49:48.919126Z", - "shell.execute_reply": "2024-12-02T22:49:48.918561Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(reward_process)" @@ -507,11 +409,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -521,8 +418,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 302f060b02..7ce89d435d 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -33,14 +33,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:40.725673Z", - "iopub.status.busy": "2024-12-02T22:47:40.725546Z", - "iopub.status.idle": "2024-12-02T22:48:06.979747Z", - "shell.execute_reply": "2024-12-02T22:48:06.978839Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# launch the offline engine\n", @@ -61,14 +54,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:06.982095Z", - "iopub.status.busy": "2024-12-02T22:48:06.981910Z", - "iopub.status.idle": "2024-12-02T22:48:08.220069Z", - "shell.execute_reply": "2024-12-02T22:48:08.219489Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -96,14 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:08.221656Z", - "iopub.status.busy": "2024-12-02T22:48:08.221505Z", - "iopub.status.idle": "2024-12-02T22:48:11.077997Z", - "shell.execute_reply": "2024-12-02T22:48:11.077319Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -134,14 +113,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:11.079727Z", - "iopub.status.busy": "2024-12-02T22:48:11.079451Z", - "iopub.status.idle": "2024-12-02T22:48:12.007378Z", - "shell.execute_reply": "2024-12-02T22:48:12.006821Z" - } - }, + "metadata": {}, 
"outputs": [], "source": [ "prompts = [\n", @@ -176,14 +148,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:12.008941Z", - "iopub.status.busy": "2024-12-02T22:48:12.008803Z", - "iopub.status.idle": "2024-12-02T22:48:14.701721Z", - "shell.execute_reply": "2024-12-02T22:48:14.701146Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -212,15 +177,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:14.703412Z", - "iopub.status.busy": "2024-12-02T22:48:14.702985Z", - "iopub.status.idle": "2024-12-02T22:48:14.767369Z", - "shell.execute_reply": "2024-12-02T22:48:14.766815Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "llm.shutdown()" @@ -237,8 +195,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 552c60adbb..499b7fb740 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,15 +36,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:17.996308Z", - "iopub.status.busy": "2024-12-02T22:46:17.995895Z", - "iopub.status.idle": "2024-12-02T22:46:50.231557Z", - "shell.execute_reply": "2024-12-02T22:46:50.231084Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -210,15 +203,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:50.233143Z", - "iopub.status.busy": "2024-12-02T22:46:50.232949Z", - "iopub.status.idle": "2024-12-02T22:46:50.886412Z", - "shell.execute_reply": "2024-12-02T22:46:50.885947Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -278,15 +264,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:50.887802Z", - "iopub.status.busy": "2024-12-02T22:46:50.887655Z", - "iopub.status.idle": "2024-12-02T22:46:51.825108Z", - "shell.execute_reply": "2024-12-02T22:46:51.824704Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -367,15 +346,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:51.826415Z", - "iopub.status.busy": "2024-12-02T22:46:51.826283Z", - "iopub.status.idle": "2024-12-02T22:46:51.883588Z", - "shell.execute_reply": "2024-12-02T22:46:51.883146Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -410,15 +382,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:51.884817Z", - "iopub.status.busy": "2024-12-02T22:46:51.884690Z", - "iopub.status.idle": "2024-12-02T22:46:52.336780Z", - "shell.execute_reply": "2024-12-02T22:46:52.336393Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -481,15 +446,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:52.338049Z", - "iopub.status.busy": "2024-12-02T22:46:52.337919Z", 
- "iopub.status.idle": "2024-12-02T22:46:53.167934Z", - "shell.execute_reply": "2024-12-02T22:46:53.167533Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -563,15 +521,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.169141Z", - "iopub.status.busy": "2024-12-02T22:46:53.169014Z", - "iopub.status.idle": "2024-12-02T22:46:53.441425Z", - "shell.execute_reply": "2024-12-02T22:46:53.441026Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -642,15 +593,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.442594Z", - "iopub.status.busy": "2024-12-02T22:46:53.442465Z", - "iopub.status.idle": "2024-12-02T22:46:53.576534Z", - "shell.execute_reply": "2024-12-02T22:46:53.576071Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -706,15 +650,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.578160Z", - "iopub.status.busy": "2024-12-02T22:46:53.577636Z", - "iopub.status.idle": "2024-12-02T22:46:53.597584Z", - "shell.execute_reply": "2024-12-02T22:46:53.597165Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -796,15 +733,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.598950Z", - "iopub.status.busy": "2024-12-02T22:46:53.598642Z", - "iopub.status.idle": "2024-12-02T22:46:56.612608Z", - "shell.execute_reply": "2024-12-02T22:46:56.612127Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -937,15 +867,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:56.613990Z", - "iopub.status.busy": "2024-12-02T22:46:56.613849Z", - "iopub.status.idle": "2024-12-02T22:47:21.651523Z", - "shell.execute_reply": "2024-12-02T22:47:21.650996Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1300,15 +1223,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:21.653719Z", - "iopub.status.busy": "2024-12-02T22:47:21.653572Z", - "iopub.status.idle": "2024-12-02T22:47:34.687047Z", - "shell.execute_reply": "2024-12-02T22:47:34.686607Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1596,15 +1512,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:34.689659Z", - "iopub.status.busy": "2024-12-02T22:47:34.689300Z", - "iopub.status.idle": "2024-12-02T22:47:36.284482Z", - "shell.execute_reply": "2024-12-02T22:47:36.283449Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)" @@ -1621,8 +1530,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 1d106a4783..65b07c384d 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -33,14 +33,7 
@@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:53.281058Z", - "iopub.status.busy": "2024-12-02T22:49:53.280790Z", - "iopub.status.idle": "2024-12-02T22:50:26.517789Z", - "shell.execute_reply": "2024-12-02T22:50:26.517313Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -70,14 +63,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.519357Z", - "iopub.status.busy": "2024-12-02T22:50:26.519135Z", - "iopub.status.idle": "2024-12-02T22:50:26.555609Z", - "shell.execute_reply": "2024-12-02T22:50:26.555227Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import subprocess, json\n", @@ -104,14 +90,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.557052Z", - "iopub.status.busy": "2024-12-02T22:50:26.556687Z", - "iopub.status.idle": "2024-12-02T22:50:26.580073Z", - "shell.execute_reply": "2024-12-02T22:50:26.579695Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import requests\n", @@ -138,14 +117,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.581299Z", - "iopub.status.busy": "2024-12-02T22:50:26.581150Z", - "iopub.status.idle": "2024-12-02T22:50:26.933134Z", - "shell.execute_reply": "2024-12-02T22:50:26.932703Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import openai\n", @@ -174,14 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.934770Z", - "iopub.status.busy": "2024-12-02T22:50:26.934415Z", - "iopub.status.idle": "2024-12-02T22:50:28.842361Z", - "shell.execute_reply": "2024-12-02T22:50:28.841666Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -205,15 +170,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:28.844332Z", - "iopub.status.busy": "2024-12-02T22:50:28.843909Z", - "iopub.status.idle": "2024-12-02T22:50:29.199607Z", - "shell.execute_reply": "2024-12-02T22:50:29.198958Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -230,8 +188,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 4e573304ab..af17b44096 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -37,14 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:33.572361Z", - "iopub.status.busy": "2024-12-02T22:50:33.572012Z", - "iopub.status.idle": "2024-12-02T22:51:08.810946Z", - "shell.execute_reply": "2024-12-02T22:51:08.810425Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -76,14 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:08.812505Z", - "iopub.status.busy": "2024-12-02T22:51:08.812313Z", - "iopub.status.idle": "2024-12-02T22:51:14.903348Z", - "shell.execute_reply": "2024-12-02T22:51:14.902880Z" 
- } - }, + "metadata": {}, "outputs": [], "source": [ "import subprocess\n", @@ -127,14 +113,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:14.905012Z", - "iopub.status.busy": "2024-12-02T22:51:14.904629Z", - "iopub.status.idle": "2024-12-02T22:51:15.313109Z", - "shell.execute_reply": "2024-12-02T22:51:15.312647Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import requests\n", @@ -174,14 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:15.314520Z", - "iopub.status.busy": "2024-12-02T22:51:15.314377Z", - "iopub.status.idle": "2024-12-02T22:51:15.921170Z", - "shell.execute_reply": "2024-12-02T22:51:15.920710Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -225,14 +197,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:15.922809Z", - "iopub.status.busy": "2024-12-02T22:51:15.922486Z", - "iopub.status.idle": "2024-12-02T22:51:17.150687Z", - "shell.execute_reply": "2024-12-02T22:51:17.150227Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -273,15 +238,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:17.151990Z", - "iopub.status.busy": "2024-12-02T22:51:17.151850Z", - "iopub.status.idle": "2024-12-02T22:51:18.756750Z", - "shell.execute_reply": "2024-12-02T22:51:18.749616Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -307,11 +265,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -321,8 +274,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 95a8dd13a6..63e8920a9e 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -30,15 +30,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:23.128756Z", - "iopub.status.busy": "2024-12-02T22:51:23.128496Z", - "iopub.status.idle": "2024-12-02T22:51:55.360578Z", - "shell.execute_reply": "2024-12-02T22:51:55.360159Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -201,15 +194,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.362103Z", - "iopub.status.busy": "2024-12-02T22:51:55.361912Z", - "iopub.status.idle": "2024-12-02T22:51:55.474877Z", - "shell.execute_reply": "2024-12-02T22:51:55.474538Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -253,15 +239,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.476355Z", - "iopub.status.busy": "2024-12-02T22:51:55.476031Z", - "iopub.status.idle": "2024-12-02T22:51:55.545152Z", - "shell.execute_reply": "2024-12-02T22:51:55.544851Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -307,15 +286,8 
@@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.546339Z", - "iopub.status.busy": "2024-12-02T22:51:55.546190Z", - "iopub.status.idle": "2024-12-02T22:51:56.180978Z", - "shell.execute_reply": "2024-12-02T22:51:56.180604Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -370,15 +342,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.182456Z", - "iopub.status.busy": "2024-12-02T22:51:56.182078Z", - "iopub.status.idle": "2024-12-02T22:51:56.498669Z", - "shell.execute_reply": "2024-12-02T22:51:56.498331Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -714,15 +679,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.499873Z", - "iopub.status.busy": "2024-12-02T22:51:56.499743Z", - "iopub.status.idle": "2024-12-02T22:51:56.734340Z", - "shell.execute_reply": "2024-12-02T22:51:56.733992Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -784,15 +742,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.735620Z", - "iopub.status.busy": "2024-12-02T22:51:56.735312Z", - "iopub.status.idle": "2024-12-02T22:51:56.967187Z", - "shell.execute_reply": "2024-12-02T22:51:56.966862Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1052,15 +1003,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.968304Z", - "iopub.status.busy": "2024-12-02T22:51:56.968158Z", - "iopub.status.idle": "2024-12-02T22:51:58.584622Z", - "shell.execute_reply": "2024-12-02T22:51:58.583792Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)" @@ -1077,8 +1021,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, From fbca85bdf225ff44117a532e243fe2b99eddf1f0 Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 23:05:48 +0000 Subject: [PATCH 3/6] remove ipynb output --- docs/backend/native_api.ipynb | 60 +- docs/backend/openai_api_completions.ipynb | 960 +--------------------- 2 files changed, 13 insertions(+), 1007 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index cd93439d41..a7a81b8487 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -32,65 +32,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:53:31] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-1B-Instruct', chat_template=None, is_embedding=False, revision=None, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, 
schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=412569283, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-12-02 22:53:38 TP0] Init torch distributed begin.\n", - "[2024-12-02 22:53:39 TP0] Load weight begin. avail mem=4.98 GB\n", - "[2024-12-02 22:53:39 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "[2024-12-02 22:53:39 TP0] Using model weights format ['*.safetensors']\n", - "[2024-12-02 22:53:39 TP0] No model.safetensors.index.json found in remote.\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00\n", - " torch.empty(\n", - "torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 79.10 GiB of which 27.50 MiB is free. Process 3843098 has 32.29 GiB memory in use. Process 3908588 has 41.52 GiB memory in use. Including non-PyTorch memory, this process has 5.24 GiB memory in use. Of the allocated memory 4.63 GiB is allocated by PyTorch, and 19.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", - "\n" - ] - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." 
- ] - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 499b7fb740..067a046885 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -38,141 +38,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:24] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=675216789, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:32 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] Load weight begin. avail mem=46.29 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] Using model weights format ['*.safetensors']\n", - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -205,35 +71,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 42, #cached-token: 1, cache hit rate: 2.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Decode batch. #running-req: 1, #token: 76, token usage: 0.00, gen throughput (token/s): 7.04, #queue-req: 0\n", - "[2024-12-02 22:46:50] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: ChatCompletion(id='d1d6ed6246d5474e94dec1325b85ede3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179610, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -266,50 +104,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 51, #cached-token: 25, cache hit rate: 20.63%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:50 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 106, token usage: 0.00, gen throughput (token/s): 125.80, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 146, token usage: 0.00, gen throughput (token/s): 142.02, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 186, token usage: 0.00, gen throughput (token/s): 141.54, #queue-req: 0\n", - "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Ancient Rome's major achievements include:

1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their engineering skills and architectural innovations.
2. **Law and Governance**: The Romans developed the Twelve Tables, a precursor to modern law, and established a system of governance that included the Senate and the Assemblies.
3. **Military Conquests**: Rome expanded its territories through a series of military campaigns, creating a vast empire that stretched from Britain to Egypt.
4. **Infrastructure Development**: They built roads, bridges, aqueducts, and canals, which facilitated trade
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -348,17 +143,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 10, #cached-token: 30, cache hit rate: 33.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "This is only a test" - ] - } - ], + "outputs": [], "source": [ "stream = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -384,42 +169,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 32.57%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 128.02, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 146.74, #queue-req: 0\n", - "[2024-12-02 22:46:52] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='bd53d29a96bc45839dcbd37d3dcc1206', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1733179612, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -448,50 +198,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 31.35%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:52 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 42, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 145.33, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 122, token usage: 0.00, gen throughput (token/s): 144.24, #queue-req: 0\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='f5503996a99040b19445f8121b49c4f1', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' In 200 words or less.\\nAs the stars whizzed by outside, Captain Orion gazed out at the vast expanse of space. Her ship, the Aurora, had been traveling for months, searching for a new home for humanity. The Earth was dying, and the crew of the Aurora was determined to find a new planet to call their own.\\nOrion\\'s eyes sparkled as she scanned the data streaming in from the ship\\'s sensors. \"Captain, I\\'m reading a planet with breathable air and liquid water,\" said her navigator, Ensign Amy K\\'Rhyn.', matched_stop='\\n\\n')], created=1733179613, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=118, prompt_tokens=10, total_tokens=128, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -523,34 +230,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 1, #new-token: 19, #cached-token: 30, cache hit rate: 37.61%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{\"name\": \"Paris\", \"population\": 2147000}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "\n", @@ -595,28 +275,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. 
#new-seq: 1, #new-token: 12, #cached-token: 30, cache hit rate: 42.75%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Paris" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -652,35 +311,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job created with ID: batch_eafe977c-1e11-447a-a77f-6905cd2cf267" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 2, #new-token: 18, #cached-token: 62, cache hit rate: 50.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -735,93 +366,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 87.91, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Batch job status: validating...trying again in 3 seconds...\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/batches/batch_eafe977c-1e11-447a-a77f-6905cd2cf267 HTTP/1.1\" 200 OK\n", - "Batch job completed successfully!\n", - "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc/content HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Request request-1:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request request-2:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes 
such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Cleaning up files..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"DELETE /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc HTTP/1.1\" 200 OK\n" - ] - } - ], + "outputs": [], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", @@ -869,287 +414,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56 TP0] Prefill batch. #new-seq: 100, #new-token: 3000, #cached-token: 2500, cache hit rate: 45.77%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:56 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 206.20, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 11850.91, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.06, gen throughput (token/s): 11616.06, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 11346.97, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 11089.22, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 10835.27, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 10583.19, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:59 TP0] Decode batch. 
#running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 10363.49, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:59 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.17, gen throughput (token/s): 10145.38, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.19, gen throughput (token/s): 9927.85, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.21, gen throughput (token/s): 9719.36, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.23, gen throughput (token/s): 9533.96, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:01 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.25, gen throughput (token/s): 9339.12, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:06] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 1 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:09] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 2 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:12] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 3 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, 
- "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:15] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 4 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:18] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 5 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -1225,208 +490,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_94c72d7f-b065-4c83-8765-d257480f694f" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 83, #new-token: 83, #cached-token: 4482, cache hit rate: 68.73%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 290, #new-token: 8192, #cached-token: 7743, cache hit rate: 56.55%, token usage: 0.01, #running-req: 83, #queue-req: 127\n", - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 128, #new-token: 3825, #cached-token: 3215, cache hit rate: 54.26%, token usage: 0.05, #running-req: 372, #queue-req: 1\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:22 TP0] Decode batch. #running-req: 500, #token: 28525, token usage: 0.14, gen throughput (token/s): 678.38, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:23 TP0] Decode batch. 
#running-req: 500, #token: 48525, token usage: 0.23, gen throughput (token/s): 26596.73, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 68525, token usage: 0.33, gen throughput (token/s): 25443.33, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 88525, token usage: 0.42, gen throughput (token/s): 24276.84, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:25 TP0] Decode batch. #running-req: 500, #token: 108525, token usage: 0.52, gen throughput (token/s): 23133.50, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:26 TP0] Decode batch. #running-req: 500, #token: 128525, token usage: 0.61, gen throughput (token/s): 22139.46, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:27 TP0] Decode batch. #running-req: 500, #token: 148525, token usage: 0.71, gen throughput (token/s): 21220.49, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:28 TP0] Decode batch. #running-req: 500, #token: 168525, token usage: 0.80, gen throughput (token/s): 20377.60, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:29 TP0] Decode batch. #running-req: 500, #token: 188525, token usage: 0.90, gen throughput (token/s): 19555.01, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:30 TP0] Decode batch. #running-req: 500, #token: 208525, token usage: 0.99, gen throughput (token/s): 18893.70, #queue-req: 0\n", - "[2024-12-02 22:47:30 TP0] Decode out of memory happened. #retracted_reqs: 23, #new_token_ratio: 0.3087 -> 0.8200\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:31 TP0] Decode out of memory happened. #retracted_reqs: 21, #new_token_ratio: 0.8009 -> 0.8600\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:31] INFO: 127.0.0.1:60160 - \"POST /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f/cancel HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Cancellation initiated. 
Status: cancelling" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"GET /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Current status: cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Batch job successfully cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"DELETE /v1/files/backend_input_file-0acbc8d2-85ef-406a-b610-1591357c7615 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Successfully cleaned up input file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully deleted local batch_requests.jsonl file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", From f946fac4d1fd722e23afc86d14e7d018d1ae1ded Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 23:06:34 +0000 Subject: [PATCH 4/6] remove ipynb output --- docs/start/send_request.ipynb | 781 +--------------------------------- 1 file changed, 7 insertions(+), 774 deletions(-) diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 63e8920a9e..4cb46f1edc 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -32,141 +32,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=720139840, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:37 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] Load weight begin. avail mem=46.29 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] Using model weights format ['*.safetensors']\n", - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -196,28 +62,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 41, #cached-token: 1, cache hit rate: 2.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55] INFO: 127.0.0.1:55794 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'id': '9bb7a92e66884dd597ed21d40b371dee', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import subprocess, json\n", "\n", @@ -241,28 +86,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 41, cache hit rate: 46.15%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55] INFO: 127.0.0.1:55804 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'id': '9e218a4ee02c483caac6fe038d1d75e7', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import requests\n", "\n", @@ -288,35 +112,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 13, #cached-token: 30, cache hit rate: 53.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55 TP0] Decode batch. #running-req: 1, #token: 60, token usage: 0.00, gen throughput (token/s): 6.97, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55806 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "ChatCompletion(id='14c5f53930714f6a8aaf2c107d60ab2d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. 
Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179916, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -344,308 +140,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55816 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 42, cache hit rate: 64.41%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "Here" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " are" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " countries" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " their" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " capitals" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 57, token usage: 0.00, gen throughput (token/s): 133.00, #queue-req: 0\n", - " Japan" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Tokyo" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Australia" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Canberra" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Brazil" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Bras" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ília" - ] - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -681,41 +176,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 3, #cached-token: 3, cache hit rate: 63.93%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 17, token usage: 0.00, gen throughput (token/s): 137.66, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55826 - \"POST /generate HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 3, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'efb0c50df0a34354b3762c764925c8e2'}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import requests\n", "\n", @@ -744,235 +205,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55836 - \"POST /generate HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:51:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 64.55%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - " a" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " of" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " romance" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " art" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " fashion" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " cuisine" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Paris" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " is" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " a" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " must" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "visit" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 138.56, #queue-req: 0\n", - " destination" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " for" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " anyone" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " who" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " loves" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " history" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " architecture" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " culture" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " From" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " the" - ] - } - ], + "outputs": [], "source": [ "import requests, json\n", "\n", From 6db5d1c6d72d686e0102141535e142094ecfa7dd Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Tue, 3 Dec 2024 05:32:23 +0000 Subject: [PATCH 5/6] revert classify --- docs/backend/native_api.ipynb | 10 ++++++++-- docs/backend/offline_engine_api.ipynb | 3 ++- docs/backend/openai_api_completions.ipynb | 3 ++- docs/backend/openai_api_embeddings.ipynb | 3 ++- docs/backend/openai_api_vision.ipynb | 3 ++- docs/start/send_request.ipynb | 3 ++- python/sglang/srt/server.py | 2 +- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index a7a81b8487..849cdd8d69 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -332,7 +332,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n", "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", - "url = \"http://localhost:30030/encode\"\n", + "url = \"http://localhost:30030/classify\"\n", "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", @@ -351,6 +351,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -360,7 +365,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 7ce89d435d..ac997641b0 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -195,7 +195,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 067a046885..ae2e8fc868 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -594,7 +594,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 65b07c384d..d3df74ea1a 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -188,7 +188,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index af17b44096..64496c56a8 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -274,7 +274,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb 
index 4cb46f1edc..b8b7d21964 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -254,7 +254,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index fc8ac150b3..7b91cb6979 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -329,7 +329,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request): ) -@app.api_route("/encode", methods=["POST", "PUT"]) +@app.api_route("/classify", methods=["POST", "PUT"]) @time_func_latency async def classify_request(obj: EmbeddingReqInput, request: Request): """Handle a reward model request. Now the arguments and return values are the same as embedding models.""" From 386df39a792f187fc8fce021b72a07ef6aa182c4 Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Tue, 3 Dec 2024 05:32:42 +0000 Subject: [PATCH 6/6] revert classify --- docs/backend/native_api.ipynb | 8 +------- docs/backend/offline_engine_api.ipynb | 3 +-- docs/backend/openai_api_completions.ipynb | 3 +-- docs/backend/openai_api_embeddings.ipynb | 3 +-- docs/backend/openai_api_vision.ipynb | 3 +-- docs/start/send_request.ipynb | 3 +-- 6 files changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 849cdd8d69..26758f7f97 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -351,11 +351,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -365,8 +360,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index ac997641b0..7ce89d435d 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -195,8 +195,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index ae2e8fc868..067a046885 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -594,8 +594,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index d3df74ea1a..65b07c384d 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -188,8 +188,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 64496c56a8..af17b44096 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -274,8 +274,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index b8b7d21964..4cb46f1edc 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -254,8 +254,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4,