From 0cf4330dc8caf43e80ae4631ca8d63f8ebf2d18b Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 22:54:30 +0000 Subject: [PATCH 1/6] fix error in docs ci --- docs/Makefile | 2 +- docs/backend/native_api.ipynb | 208 +++- docs/backend/offline_engine_api.ipynb | 59 +- docs/backend/openai_api_completions.ipynb | 1106 ++++++++++++++++++++- docs/backend/openai_api_embeddings.ipynb | 59 +- docs/backend/openai_api_vision.ipynb | 64 +- docs/start/send_request.ipynb | 872 +++++++++++++++- 7 files changed, 2259 insertions(+), 111 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 50f77a30c0..13d81f4f84 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,7 +19,7 @@ compile: echo "Executing $$nb"; \ jupyter nbconvert --to notebook --execute --inplace "$$nb" \ --ExecutePreprocessor.timeout=600 \ - --ExecutePreprocessor.kernel_name=python3; \ + --ExecutePreprocessor.kernel_name=python3 || exit 1; \ fi; \ done diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 7207259ea3..73fc54a038 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -31,8 +31,73 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:18.728819Z", + "iopub.status.busy": "2024-12-02T22:48:18.728690Z", + "iopub.status.idle": "2024-12-02T22:48:47.958696Z", + "shell.execute_reply": "2024-12-02T22:48:47.958226Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:53:31] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-1B-Instruct', chat_template=None, is_embedding=False, revision=None, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=412569283, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", + 
"/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "[2024-12-02 22:53:38 TP0] Init torch distributed begin.\n", + "[2024-12-02 22:53:39 TP0] Load weight begin. avail mem=4.98 GB\n", + "[2024-12-02 22:53:39 TP0] lm_eval is not installed, GPTQ may not be usable\n", + "[2024-12-02 22:53:39 TP0] Using model weights format ['*.safetensors']\n", + "[2024-12-02 22:53:39 TP0] No model.safetensors.index.json found in remote.\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00\n", + " torch.empty(\n", + "torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 79.10 GiB of which 27.50 MiB is free. Process 3843098 has 32.29 GiB memory in use. Process 3908588 has 41.52 GiB memory in use. Including non-PyTorch memory, this process has 5.24 GiB memory in use. Of the allocated memory 4.63 GiB is allocated by PyTorch, and 19.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", + "\n" + ] + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", + "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", + "\u001b[1;31mClick here for more info. \n", + "\u001b[1;31mView Jupyter log for further details." 
+ ] + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -63,7 +128,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:47.960280Z", + "iopub.status.busy": "2024-12-02T22:48:47.960068Z", + "iopub.status.idle": "2024-12-02T22:48:48.227582Z", + "shell.execute_reply": "2024-12-02T22:48:48.227156Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/generate\"\n", @@ -89,7 +161,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.228900Z", + "iopub.status.busy": "2024-12-02T22:48:48.228763Z", + "iopub.status.idle": "2024-12-02T22:48:48.234059Z", + "shell.execute_reply": "2024-12-02T22:48:48.233689Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/get_model_info\"\n", @@ -118,7 +197,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.235454Z", + "iopub.status.busy": "2024-12-02T22:48:48.235125Z", + "iopub.status.idle": "2024-12-02T22:48:48.239787Z", + "shell.execute_reply": "2024-12-02T22:48:48.239433Z" + } + }, "outputs": [], "source": [ "# get_server_info\n", @@ -141,7 +227,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.240966Z", + "iopub.status.busy": "2024-12-02T22:48:48.240838Z", + "iopub.status.idle": "2024-12-02T22:48:48.254049Z", + "shell.execute_reply": "2024-12-02T22:48:48.253663Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/health_generate\"\n", @@ -153,7 +246,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.255429Z", + "iopub.status.busy": "2024-12-02T22:48:48.255085Z", + "iopub.status.idle": "2024-12-02T22:48:48.259344Z", + "shell.execute_reply": "2024-12-02T22:48:48.258964Z" + } + }, "outputs": [], "source": [ "url = \"http://localhost:30010/health\"\n", @@ -174,7 +274,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.260464Z", + "iopub.status.busy": "2024-12-02T22:48:48.260335Z", + "iopub.status.idle": "2024-12-02T22:48:48.265409Z", + "shell.execute_reply": "2024-12-02T22:48:48.264993Z" + } + }, "outputs": [], "source": [ "# flush cache\n", @@ -199,7 +306,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.266586Z", + "iopub.status.busy": "2024-12-02T22:48:48.266452Z", + "iopub.status.idle": "2024-12-02T22:48:48.946566Z", + "shell.execute_reply": "2024-12-02T22:48:48.946081Z" + } + }, "outputs": [], "source": [ "# successful update with same architecture and size\n", @@ -217,22 +331,29 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:48.948180Z", + "iopub.status.busy": "2024-12-02T22:48:48.947760Z", + "iopub.status.idle": "2024-12-02T22:48:49.075751Z", + "shell.execute_reply": "2024-12-02T22:48:49.075374Z" + } + }, "outputs": [], "source": [ - "# failed update with different parameter size\n", + "# failed update with different parameter size or wrong name\n", "\n", "url = 
\"http://localhost:30010/update_weights_from_disk\"\n", - "data = {\"model_path\": \"meta-llama/Llama-3.2-3B\"}\n", + "data = {\"model_path\": \"meta-llama/Llama-3.2-1B-wrong\"}\n", "\n", "response = requests.post(url, json=data)\n", "response_json = response.json()\n", "print_highlight(response_json)\n", "assert response_json[\"success\"] is False\n", "assert response_json[\"message\"] == (\n", - " \"Failed to update weights: The size of tensor a (2048) must match \"\n", - " \"the size of tensor b (3072) at non-singleton dimension 1.\\n\"\n", - " \"Rolling back to original weights.\"\n", + " \"Failed to get weights iterator: \"\n", + " \"meta-llama/Llama-3.2-1B-wrong\"\n", + " \" (repository not found).\"\n", ")" ] }, @@ -249,7 +370,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:49.076953Z", + "iopub.status.busy": "2024-12-02T22:48:49.076828Z", + "iopub.status.idle": "2024-12-02T22:49:23.694647Z", + "shell.execute_reply": "2024-12-02T22:49:23.694216Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)\n", @@ -267,7 +395,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:23.696104Z", + "iopub.status.busy": "2024-12-02T22:49:23.695882Z", + "iopub.status.idle": "2024-12-02T22:49:23.723827Z", + "shell.execute_reply": "2024-12-02T22:49:23.723526Z" + } + }, "outputs": [], "source": [ "# successful encode for embedding model\n", @@ -292,7 +427,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:23.724947Z", + "iopub.status.busy": "2024-12-02T22:49:23.724827Z", + "iopub.status.idle": "2024-12-02T22:49:47.825508Z", + "shell.execute_reply": "2024-12-02T22:49:47.825004Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)\n", @@ -312,7 +454,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:47.826961Z", + "iopub.status.busy": "2024-12-02T22:49:47.826823Z", + "iopub.status.idle": "2024-12-02T22:49:48.853347Z", + "shell.execute_reply": "2024-12-02T22:49:48.852882Z" + } + }, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", @@ -332,7 +481,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n", "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", - "url = \"http://localhost:30030/classify\"\n", + "url = \"http://localhost:30030/encode\"\n", "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", @@ -342,8 +491,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:48.854711Z", + "iopub.status.busy": "2024-12-02T22:49:48.854489Z", + "iopub.status.idle": "2024-12-02T22:49:48.919126Z", + "shell.execute_reply": "2024-12-02T22:49:48.918561Z" + } + }, "outputs": [], "source": [ "terminate_process(reward_process)" @@ -351,6 +507,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -360,7 +521,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": 
"python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 7ce89d435d..302f060b02 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -33,7 +33,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:40.725673Z", + "iopub.status.busy": "2024-12-02T22:47:40.725546Z", + "iopub.status.idle": "2024-12-02T22:48:06.979747Z", + "shell.execute_reply": "2024-12-02T22:48:06.978839Z" + } + }, "outputs": [], "source": [ "# launch the offline engine\n", @@ -54,7 +61,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:06.982095Z", + "iopub.status.busy": "2024-12-02T22:48:06.981910Z", + "iopub.status.idle": "2024-12-02T22:48:08.220069Z", + "shell.execute_reply": "2024-12-02T22:48:08.219489Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -82,7 +96,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:08.221656Z", + "iopub.status.busy": "2024-12-02T22:48:08.221505Z", + "iopub.status.idle": "2024-12-02T22:48:11.077997Z", + "shell.execute_reply": "2024-12-02T22:48:11.077319Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -113,7 +134,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:11.079727Z", + "iopub.status.busy": "2024-12-02T22:48:11.079451Z", + "iopub.status.idle": "2024-12-02T22:48:12.007378Z", + "shell.execute_reply": "2024-12-02T22:48:12.006821Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -148,7 +176,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:12.008941Z", + "iopub.status.busy": "2024-12-02T22:48:12.008803Z", + "iopub.status.idle": "2024-12-02T22:48:14.701721Z", + "shell.execute_reply": "2024-12-02T22:48:14.701146Z" + } + }, "outputs": [], "source": [ "prompts = [\n", @@ -177,8 +212,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:48:14.703412Z", + "iopub.status.busy": "2024-12-02T22:48:14.702985Z", + "iopub.status.idle": "2024-12-02T22:48:14.767369Z", + "shell.execute_reply": "2024-12-02T22:48:14.766815Z" + } + }, "outputs": [], "source": [ "llm.shutdown()" @@ -195,7 +237,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 067a046885..552c60adbb 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,9 +36,150 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:17.996308Z", + "iopub.status.busy": "2024-12-02T22:46:17.995895Z", + "iopub.status.idle": "2024-12-02T22:46:50.231557Z", + "shell.execute_reply": "2024-12-02T22:46:50.231084Z" + } + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:24] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=675216789, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:32 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] Load weight begin. 
avail mem=46.29 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:33 TP0] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -69,9 +210,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:50.233143Z", + "iopub.status.busy": "2024-12-02T22:46:50.232949Z", + "iopub.status.idle": "2024-12-02T22:46:50.886412Z", + "shell.execute_reply": "2024-12-02T22:46:50.885947Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 42, #cached-token: 1, cache hit rate: 2.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Decode batch. #running-req: 1, #token: 76, token usage: 0.00, gen throughput (token/s): 7.04, #queue-req: 0\n", + "[2024-12-02 22:46:50] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: ChatCompletion(id='d1d6ed6246d5474e94dec1325b85ede3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179610, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import openai\n", "\n", @@ -102,9 +278,59 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:50.887802Z", + "iopub.status.busy": "2024-12-02T22:46:50.887655Z", + "iopub.status.idle": "2024-12-02T22:46:51.825108Z", + "shell.execute_reply": "2024-12-02T22:46:51.824704Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 51, #cached-token: 25, cache hit rate: 20.63%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:50 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 106, token usage: 0.00, gen throughput (token/s): 125.80, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 146, token usage: 0.00, gen throughput (token/s): 142.02, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Decode batch. 
#running-req: 1, #token: 186, token usage: 0.00, gen throughput (token/s): 141.54, #queue-req: 0\n", + "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Ancient Rome's major achievements include:

1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their engineering skills and architectural innovations.
2. **Law and Governance**: The Romans developed the Twelve Tables, a precursor to modern law, and established a system of governance that included the Senate and the Assemblies.
3. **Military Conquests**: Rome expanded its territories through a series of military campaigns, creating a vast empire that stretched from Britain to Egypt.
4. **Infrastructure Development**: They built roads, bridges, aqueducts, and canals, which facilitated trade
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -141,9 +367,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:51.826415Z", + "iopub.status.busy": "2024-12-02T22:46:51.826283Z", + "iopub.status.idle": "2024-12-02T22:46:51.883588Z", + "shell.execute_reply": "2024-12-02T22:46:51.883146Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 10, #cached-token: 30, cache hit rate: 33.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "This is only a test" + ] + } + ], "source": [ "stream = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -167,9 +410,51 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:51.884817Z", + "iopub.status.busy": "2024-12-02T22:46:51.884690Z", + "iopub.status.idle": "2024-12-02T22:46:52.336780Z", + "shell.execute_reply": "2024-12-02T22:46:52.336393Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 32.57%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 128.02, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 146.74, #queue-req: 0\n", + "[2024-12-02 22:46:52] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='bd53d29a96bc45839dcbd37d3dcc1206', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. 
Brazil - Bras', matched_stop=None)], created=1733179612, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -196,9 +481,59 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:52.338049Z", + "iopub.status.busy": "2024-12-02T22:46:52.337919Z", + "iopub.status.idle": "2024-12-02T22:46:53.167934Z", + "shell.execute_reply": "2024-12-02T22:46:53.167533Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 31.35%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:52 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 42, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 145.33, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 122, token usage: 0.00, gen throughput (token/s): 144.24, #queue-req: 0\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Response: Completion(id='f5503996a99040b19445f8121b49c4f1', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' In 200 words or less.\\nAs the stars whizzed by outside, Captain Orion gazed out at the vast expanse of space. Her ship, the Aurora, had been traveling for months, searching for a new home for humanity. The Earth was dying, and the crew of the Aurora was determined to find a new planet to call their own.\\nOrion\\'s eyes sparkled as she scanned the data streaming in from the ship\\'s sensors. 
\"Captain, I\\'m reading a planet with breathable air and liquid water,\" said her navigator, Ensign Amy K\\'Rhyn.', matched_stop='\\n\\n')], created=1733179613, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=118, prompt_tokens=10, total_tokens=128, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -228,9 +563,43 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.169141Z", + "iopub.status.busy": "2024-12-02T22:46:53.169014Z", + "iopub.status.idle": "2024-12-02T22:46:53.441425Z", + "shell.execute_reply": "2024-12-02T22:46:53.441026Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 1, #new-token: 19, #cached-token: 30, cache hit rate: 37.61%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{\"name\": \"Paris\", \"population\": 2147000}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "\n", @@ -273,9 +642,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.442594Z", + "iopub.status.busy": "2024-12-02T22:46:53.442465Z", + "iopub.status.idle": "2024-12-02T22:46:53.576534Z", + "shell.execute_reply": "2024-12-02T22:46:53.576071Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. 
#new-seq: 1, #new-token: 12, #cached-token: 30, cache hit rate: 42.75%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Paris" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -309,9 +706,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.578160Z", + "iopub.status.busy": "2024-12-02T22:46:53.577636Z", + "iopub.status.idle": "2024-12-02T22:46:53.597584Z", + "shell.execute_reply": "2024-12-02T22:46:53.597165Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job created with ID: batch_eafe977c-1e11-447a-a77f-6905cd2cf267" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 2, #new-token: 18, #cached-token: 62, cache hit rate: 50.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + } + ], "source": [ "import json\n", "import time\n", @@ -364,9 +796,102 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:53.598950Z", + "iopub.status.busy": "2024-12-02T22:46:53.598642Z", + "iopub.status.idle": "2024-12-02T22:46:56.612608Z", + "shell.execute_reply": "2024-12-02T22:46:56.612127Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:53 TP0] Decode batch. 
#running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 87.91, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Batch job status: validating...trying again in 3 seconds...\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/batches/batch_eafe977c-1e11-447a-a77f-6905cd2cf267 HTTP/1.1\" 200 OK\n", + "Batch job completed successfully!\n", + "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc/content HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Request request-1:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request request-2:" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Cleaning up files..." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"DELETE /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc HTTP/1.1\" 200 OK\n" + ] + } + ], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", @@ -412,9 +937,296 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:46:56.613990Z", + "iopub.status.busy": "2024-12-02T22:46:56.613849Z", + "iopub.status.idle": "2024-12-02T22:47:21.651523Z", + "shell.execute_reply": "2024-12-02T22:47:21.650996Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:56 TP0] Prefill batch. #new-seq: 100, #new-token: 3000, #cached-token: 2500, cache hit rate: 45.77%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:46:56 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 206.20, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 11850.91, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.06, gen throughput (token/s): 11616.06, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 11346.97, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 11089.22, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 10835.27, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 10583.19, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:59 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 10363.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:46:59 TP0] Decode batch. 
#running-req: 100, #token: 35725, token usage: 0.17, gen throughput (token/s): 10145.38, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.19, gen throughput (token/s): 9927.85, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.21, gen throughput (token/s): 9719.36, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.23, gen throughput (token/s): 9533.96, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:01 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.25, gen throughput (token/s): 9339.12, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:06] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 1 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:09] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 2 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:12] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 3 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:15] INFO: 127.0.0.1:59256 - \"GET 
/v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 4 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:18] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Batch job details (check 5 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Request counts: Total: 100 // Completed: 100 // Failed: 0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "import time\n", @@ -488,9 +1300,217 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:21.653719Z", + "iopub.status.busy": "2024-12-02T22:47:21.653572Z", + "iopub.status.idle": "2024-12-02T22:47:34.687047Z", + "shell.execute_reply": "2024-12-02T22:47:34.686607Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/files HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Created batch job with ID: batch_94c72d7f-b065-4c83-8765-d257480f694f" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Initial status: validating" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 83, #new-token: 83, #cached-token: 4482, cache hit rate: 68.73%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 290, #new-token: 8192, #cached-token: 7743, cache hit rate: 56.55%, token usage: 0.01, #running-req: 83, #queue-req: 127\n", + "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 128, #new-token: 3825, #cached-token: 3215, cache hit rate: 54.26%, token usage: 0.05, #running-req: 372, #queue-req: 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:22 TP0] Decode batch. #running-req: 500, #token: 28525, token usage: 0.14, gen throughput (token/s): 678.38, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:23 TP0] Decode batch. 
#running-req: 500, #token: 48525, token usage: 0.23, gen throughput (token/s): 26596.73, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 68525, token usage: 0.33, gen throughput (token/s): 25443.33, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 88525, token usage: 0.42, gen throughput (token/s): 24276.84, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:25 TP0] Decode batch. #running-req: 500, #token: 108525, token usage: 0.52, gen throughput (token/s): 23133.50, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:26 TP0] Decode batch. #running-req: 500, #token: 128525, token usage: 0.61, gen throughput (token/s): 22139.46, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:27 TP0] Decode batch. #running-req: 500, #token: 148525, token usage: 0.71, gen throughput (token/s): 21220.49, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:28 TP0] Decode batch. #running-req: 500, #token: 168525, token usage: 0.80, gen throughput (token/s): 20377.60, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:29 TP0] Decode batch. #running-req: 500, #token: 188525, token usage: 0.90, gen throughput (token/s): 19555.01, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:30 TP0] Decode batch. #running-req: 500, #token: 208525, token usage: 0.99, gen throughput (token/s): 18893.70, #queue-req: 0\n", + "[2024-12-02 22:47:30 TP0] Decode out of memory happened. #retracted_reqs: 23, #new_token_ratio: 0.3087 -> 0.8200\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:31 TP0] Decode out of memory happened. #retracted_reqs: 21, #new_token_ratio: 0.8009 -> 0.8600\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:31] INFO: 127.0.0.1:60160 - \"POST /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f/cancel HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Cancellation initiated. 
Status: cancelling" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"GET /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Current status: cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Batch job successfully cancelled" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"DELETE /v1/files/backend_input_file-0acbc8d2-85ef-406a-b610-1591357c7615 HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "Successfully cleaned up input file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Successfully deleted local batch_requests.jsonl file" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import json\n", "import time\n", @@ -576,8 +1596,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:47:34.689659Z", + "iopub.status.busy": "2024-12-02T22:47:34.689300Z", + "iopub.status.idle": "2024-12-02T22:47:36.284482Z", + "shell.execute_reply": "2024-12-02T22:47:36.283449Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)" @@ -594,7 +1621,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 65b07c384d..1d106a4783 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -33,7 +33,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:49:53.281058Z", + "iopub.status.busy": "2024-12-02T22:49:53.280790Z", + "iopub.status.idle": "2024-12-02T22:50:26.517789Z", + "shell.execute_reply": "2024-12-02T22:50:26.517313Z" + } + }, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -63,7 +70,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.519357Z", + "iopub.status.busy": "2024-12-02T22:50:26.519135Z", + "iopub.status.idle": "2024-12-02T22:50:26.555609Z", + "shell.execute_reply": "2024-12-02T22:50:26.555227Z" + } + }, "outputs": [], "source": [ "import subprocess, json\n", @@ -90,7 +104,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.557052Z", + "iopub.status.busy": "2024-12-02T22:50:26.556687Z", + "iopub.status.idle": "2024-12-02T22:50:26.580073Z", + "shell.execute_reply": "2024-12-02T22:50:26.579695Z" + } + }, "outputs": [], "source": [ "import requests\n", @@ -117,7 +138,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.581299Z", + "iopub.status.busy": "2024-12-02T22:50:26.581150Z", + 
"iopub.status.idle": "2024-12-02T22:50:26.933134Z", + "shell.execute_reply": "2024-12-02T22:50:26.932703Z" + } + }, "outputs": [], "source": [ "import openai\n", @@ -146,7 +174,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:26.934770Z", + "iopub.status.busy": "2024-12-02T22:50:26.934415Z", + "iopub.status.idle": "2024-12-02T22:50:28.842361Z", + "shell.execute_reply": "2024-12-02T22:50:28.841666Z" + } + }, "outputs": [], "source": [ "import json\n", @@ -170,8 +205,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:28.844332Z", + "iopub.status.busy": "2024-12-02T22:50:28.843909Z", + "iopub.status.idle": "2024-12-02T22:50:29.199607Z", + "shell.execute_reply": "2024-12-02T22:50:29.198958Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -188,7 +230,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index af17b44096..4e573304ab 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -37,7 +37,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:50:33.572361Z", + "iopub.status.busy": "2024-12-02T22:50:33.572012Z", + "iopub.status.idle": "2024-12-02T22:51:08.810946Z", + "shell.execute_reply": "2024-12-02T22:51:08.810425Z" + } + }, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -69,7 +76,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:08.812505Z", + "iopub.status.busy": "2024-12-02T22:51:08.812313Z", + "iopub.status.idle": "2024-12-02T22:51:14.903348Z", + "shell.execute_reply": "2024-12-02T22:51:14.902880Z" + } + }, "outputs": [], "source": [ "import subprocess\n", @@ -113,7 +127,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:14.905012Z", + "iopub.status.busy": "2024-12-02T22:51:14.904629Z", + "iopub.status.idle": "2024-12-02T22:51:15.313109Z", + "shell.execute_reply": "2024-12-02T22:51:15.312647Z" + } + }, "outputs": [], "source": [ "import requests\n", @@ -153,7 +174,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:15.314520Z", + "iopub.status.busy": "2024-12-02T22:51:15.314377Z", + "iopub.status.idle": "2024-12-02T22:51:15.921170Z", + "shell.execute_reply": "2024-12-02T22:51:15.920710Z" + } + }, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -197,7 +225,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:15.922809Z", + "iopub.status.busy": "2024-12-02T22:51:15.922486Z", + "iopub.status.idle": "2024-12-02T22:51:17.150687Z", + "shell.execute_reply": "2024-12-02T22:51:17.150227Z" + } + }, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -238,8 +273,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + 
"execution": { + "iopub.execute_input": "2024-12-02T22:51:17.151990Z", + "iopub.status.busy": "2024-12-02T22:51:17.151850Z", + "iopub.status.idle": "2024-12-02T22:51:18.756750Z", + "shell.execute_reply": "2024-12-02T22:51:18.749616Z" + } + }, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -265,6 +307,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -274,7 +321,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 4cb46f1edc..95a8dd13a6 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -30,9 +30,150 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:23.128756Z", + "iopub.status.busy": "2024-12-02T22:51:23.128496Z", + "iopub.status.idle": "2024-12-02T22:51:55.360578Z", + "shell.execute_reply": "2024-12-02T22:51:55.360159Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=720139840, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, 
enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n", + "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:37 TP0] Init torch distributed begin.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] Load weight begin. avail mem=46.29 GB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] lm_eval is not installed, GPTQ may not be usable\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:38 TP0] Using model weights format ['*.safetensors']\n", + "\r", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in their original black color, while the notebook outputs are highlighted in blue.
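The combined logs above come from a simple launch-and-wait pattern. Below is a minimal sketch of it, assuming the standard `python -m sglang.launch_server` entry point; the `--model-path`, `--host`, and `--port` values are taken from the server logs above rather than from this hunk, so treat them as assumptions.

# Minimal sketch of the notebook's launch pattern. The helpers come from
# sglang.utils, which this notebook imports below; model path and port 30000
# mirror the server logs above.
from sglang.utils import execute_shell_command, terminate_process, wait_for_server

server_process = execute_shell_command(
    "python -m sglang.launch_server "
    "--model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000"
)

# Block until the server answers health checks before sending any requests.
wait_for_server("http://localhost:30000")

# ... send requests against http://localhost:30000 here ...

# Free the GPU once the notebook is done with the server.
terminate_process(server_process)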
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -60,9 +201,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.362103Z", + "iopub.status.busy": "2024-12-02T22:51:55.361912Z", + "iopub.status.idle": "2024-12-02T22:51:55.474877Z", + "shell.execute_reply": "2024-12-02T22:51:55.474538Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 41, #cached-token: 1, cache hit rate: 2.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55] INFO: 127.0.0.1:55794 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'id': '9bb7a92e66884dd597ed21d40b371dee', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import subprocess, json\n", "\n", @@ -84,9 +253,37 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.476355Z", + "iopub.status.busy": "2024-12-02T22:51:55.476031Z", + "iopub.status.idle": "2024-12-02T22:51:55.545152Z", + "shell.execute_reply": "2024-12-02T22:51:55.544851Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 41, cache hit rate: 46.15%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55] INFO: 127.0.0.1:55804 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'id': '9e218a4ee02c483caac6fe038d1d75e7', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import requests\n", "\n", @@ -110,9 +307,44 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:55.546339Z", + "iopub.status.busy": "2024-12-02T22:51:55.546190Z", + "iopub.status.idle": "2024-12-02T22:51:56.180978Z", + "shell.execute_reply": "2024-12-02T22:51:56.180604Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 13, #cached-token: 30, cache hit rate: 53.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "[2024-12-02 22:51:55 TP0] Decode batch. 
#running-req: 1, #token: 60, token usage: 0.00, gen throughput (token/s): 6.97, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55806 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "ChatCompletion(id='14c5f53930714f6a8aaf2c107d60ab2d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179916, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import openai\n", "\n", @@ -138,9 +370,317 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.182456Z", + "iopub.status.busy": "2024-12-02T22:51:56.182078Z", + "iopub.status.idle": "2024-12-02T22:51:56.498669Z", + "shell.execute_reply": "2024-12-02T22:51:56.498331Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55816 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 42, cache hit rate: 64.41%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + "Here" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " are" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " countries" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " their" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " capitals" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. 
#running-req: 1, #token: 57, token usage: 0.00, gen throughput (token/s): 133.00, #queue-req: 0\n", + " Japan" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Tokyo" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Australia" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Canberra" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Brazil" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Capital" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Bras" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ília" + ] + } + ], "source": [ "import openai\n", "\n", @@ -174,9 +714,50 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.499873Z", + "iopub.status.busy": "2024-12-02T22:51:56.499743Z", + "iopub.status.idle": "2024-12-02T22:51:56.734340Z", + "shell.execute_reply": "2024-12-02T22:51:56.733992Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 3, #cached-token: 3, cache hit rate: 63.93%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 17, token usage: 0.00, gen throughput (token/s): 137.66, #queue-req: 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55826 - \"POST /generate HTTP/1.1\" 200 OK\n" + ] + }, + { + "data": { + "text/html": [ + "{'text': ' a city of romance, art, fashion, and history. 
Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 3, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'efb0c50df0a34354b3762c764925c8e2'}}" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import requests\n", "\n", @@ -203,9 +784,244 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.735620Z", + "iopub.status.busy": "2024-12-02T22:51:56.735312Z", + "iopub.status.idle": "2024-12-02T22:51:56.967187Z", + "shell.execute_reply": "2024-12-02T22:51:56.966862Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56] INFO: 127.0.0.1:55836 - \"POST /generate HTTP/1.1\" 200 OK\n", + "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 64.55%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", + " a" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " city" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " of" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " romance" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " art" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " fashion" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " cuisine" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Paris" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " is" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " a" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " must" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "visit" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-12-02 22:51:56 TP0] Decode batch. 
#running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 138.56, #queue-req: 0\n", + " destination" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " for" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " anyone" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " who" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " loves" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " history" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " architecture" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "," + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " culture" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "." + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " From" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " the" + ] + } + ], "source": [ "import requests, json\n", "\n", @@ -236,8 +1052,15 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "execution": { + "iopub.execute_input": "2024-12-02T22:51:56.968304Z", + "iopub.status.busy": "2024-12-02T22:51:56.968158Z", + "iopub.status.idle": "2024-12-02T22:51:58.584622Z", + "shell.execute_reply": "2024-12-02T22:51:58.583792Z" + } + }, "outputs": [], "source": [ "terminate_process(server_process)" @@ -254,7 +1077,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, From d8765ef2d8d7a68c412e78b1ec08582badeccf0c Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 22:59:35 +0000 Subject: [PATCH 2/6] remove ipynb output --- docs/backend/native_api.ipynb | 136 +++----------------- docs/backend/offline_engine_api.ipynb | 59 ++------- docs/backend/openai_api_completions.ipynb | 146 ++++------------------ docs/backend/openai_api_embeddings.ipynb | 59 ++------- docs/backend/openai_api_vision.ipynb | 64 ++-------- docs/start/send_request.ipynb | 91 +++----------- 6 files changed, 84 insertions(+), 471 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 73fc54a038..cd93439d41 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -31,14 +31,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:18.728819Z", - "iopub.status.busy": "2024-12-02T22:48:18.728690Z", - "iopub.status.idle": "2024-12-02T22:48:47.958696Z", - "shell.execute_reply": "2024-12-02T22:48:47.958226Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -128,14 +121,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:47.960280Z", - "iopub.status.busy": "2024-12-02T22:48:47.960068Z", - "iopub.status.idle": "2024-12-02T22:48:48.227582Z", - "shell.execute_reply": "2024-12-02T22:48:48.227156Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/generate\"\n", @@ -161,14 +147,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - 
"iopub.execute_input": "2024-12-02T22:48:48.228900Z", - "iopub.status.busy": "2024-12-02T22:48:48.228763Z", - "iopub.status.idle": "2024-12-02T22:48:48.234059Z", - "shell.execute_reply": "2024-12-02T22:48:48.233689Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/get_model_info\"\n", @@ -197,14 +176,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.235454Z", - "iopub.status.busy": "2024-12-02T22:48:48.235125Z", - "iopub.status.idle": "2024-12-02T22:48:48.239787Z", - "shell.execute_reply": "2024-12-02T22:48:48.239433Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# get_server_info\n", @@ -227,14 +199,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.240966Z", - "iopub.status.busy": "2024-12-02T22:48:48.240838Z", - "iopub.status.idle": "2024-12-02T22:48:48.254049Z", - "shell.execute_reply": "2024-12-02T22:48:48.253663Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/health_generate\"\n", @@ -246,14 +211,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.255429Z", - "iopub.status.busy": "2024-12-02T22:48:48.255085Z", - "iopub.status.idle": "2024-12-02T22:48:48.259344Z", - "shell.execute_reply": "2024-12-02T22:48:48.258964Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "url = \"http://localhost:30010/health\"\n", @@ -274,14 +232,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.260464Z", - "iopub.status.busy": "2024-12-02T22:48:48.260335Z", - "iopub.status.idle": "2024-12-02T22:48:48.265409Z", - "shell.execute_reply": "2024-12-02T22:48:48.264993Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# flush cache\n", @@ -306,14 +257,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.266586Z", - "iopub.status.busy": "2024-12-02T22:48:48.266452Z", - "iopub.status.idle": "2024-12-02T22:48:48.946566Z", - "shell.execute_reply": "2024-12-02T22:48:48.946081Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# successful update with same architecture and size\n", @@ -331,14 +275,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:48.948180Z", - "iopub.status.busy": "2024-12-02T22:48:48.947760Z", - "iopub.status.idle": "2024-12-02T22:48:49.075751Z", - "shell.execute_reply": "2024-12-02T22:48:49.075374Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# failed update with different parameter size or wrong name\n", @@ -370,14 +307,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:49.076953Z", - "iopub.status.busy": "2024-12-02T22:48:49.076828Z", - "iopub.status.idle": "2024-12-02T22:49:23.694647Z", - "shell.execute_reply": "2024-12-02T22:49:23.694216Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)\n", @@ -395,14 +325,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:23.696104Z", - "iopub.status.busy": "2024-12-02T22:49:23.695882Z", - "iopub.status.idle": "2024-12-02T22:49:23.723827Z", - "shell.execute_reply": 
"2024-12-02T22:49:23.723526Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# successful encode for embedding model\n", @@ -427,14 +350,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:23.724947Z", - "iopub.status.busy": "2024-12-02T22:49:23.724827Z", - "iopub.status.idle": "2024-12-02T22:49:47.825508Z", - "shell.execute_reply": "2024-12-02T22:49:47.825004Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)\n", @@ -454,14 +370,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:47.826961Z", - "iopub.status.busy": "2024-12-02T22:49:47.826823Z", - "iopub.status.idle": "2024-12-02T22:49:48.853347Z", - "shell.execute_reply": "2024-12-02T22:49:48.852882Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", @@ -491,15 +400,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:48.854711Z", - "iopub.status.busy": "2024-12-02T22:49:48.854489Z", - "iopub.status.idle": "2024-12-02T22:49:48.919126Z", - "shell.execute_reply": "2024-12-02T22:49:48.918561Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(reward_process)" @@ -507,11 +409,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -521,8 +418,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 302f060b02..7ce89d435d 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -33,14 +33,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:40.725673Z", - "iopub.status.busy": "2024-12-02T22:47:40.725546Z", - "iopub.status.idle": "2024-12-02T22:48:06.979747Z", - "shell.execute_reply": "2024-12-02T22:48:06.978839Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# launch the offline engine\n", @@ -61,14 +54,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:06.982095Z", - "iopub.status.busy": "2024-12-02T22:48:06.981910Z", - "iopub.status.idle": "2024-12-02T22:48:08.220069Z", - "shell.execute_reply": "2024-12-02T22:48:08.219489Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -96,14 +82,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:08.221656Z", - "iopub.status.busy": "2024-12-02T22:48:08.221505Z", - "iopub.status.idle": "2024-12-02T22:48:11.077997Z", - "shell.execute_reply": "2024-12-02T22:48:11.077319Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -134,14 +113,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:11.079727Z", - "iopub.status.busy": "2024-12-02T22:48:11.079451Z", - "iopub.status.idle": "2024-12-02T22:48:12.007378Z", - "shell.execute_reply": "2024-12-02T22:48:12.006821Z" - } - }, + "metadata": {}, 
"outputs": [], "source": [ "prompts = [\n", @@ -176,14 +148,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:12.008941Z", - "iopub.status.busy": "2024-12-02T22:48:12.008803Z", - "iopub.status.idle": "2024-12-02T22:48:14.701721Z", - "shell.execute_reply": "2024-12-02T22:48:14.701146Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "prompts = [\n", @@ -212,15 +177,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:48:14.703412Z", - "iopub.status.busy": "2024-12-02T22:48:14.702985Z", - "iopub.status.idle": "2024-12-02T22:48:14.767369Z", - "shell.execute_reply": "2024-12-02T22:48:14.766815Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "llm.shutdown()" @@ -237,8 +195,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 552c60adbb..499b7fb740 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -36,15 +36,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:17.996308Z", - "iopub.status.busy": "2024-12-02T22:46:17.995895Z", - "iopub.status.idle": "2024-12-02T22:46:50.231557Z", - "shell.execute_reply": "2024-12-02T22:46:50.231084Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -210,15 +203,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:50.233143Z", - "iopub.status.busy": "2024-12-02T22:46:50.232949Z", - "iopub.status.idle": "2024-12-02T22:46:50.886412Z", - "shell.execute_reply": "2024-12-02T22:46:50.885947Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -278,15 +264,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:50.887802Z", - "iopub.status.busy": "2024-12-02T22:46:50.887655Z", - "iopub.status.idle": "2024-12-02T22:46:51.825108Z", - "shell.execute_reply": "2024-12-02T22:46:51.824704Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -367,15 +346,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:51.826415Z", - "iopub.status.busy": "2024-12-02T22:46:51.826283Z", - "iopub.status.idle": "2024-12-02T22:46:51.883588Z", - "shell.execute_reply": "2024-12-02T22:46:51.883146Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -410,15 +382,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:51.884817Z", - "iopub.status.busy": "2024-12-02T22:46:51.884690Z", - "iopub.status.idle": "2024-12-02T22:46:52.336780Z", - "shell.execute_reply": "2024-12-02T22:46:52.336393Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -481,15 +446,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:52.338049Z", - "iopub.status.busy": "2024-12-02T22:46:52.337919Z", 
- "iopub.status.idle": "2024-12-02T22:46:53.167934Z", - "shell.execute_reply": "2024-12-02T22:46:53.167533Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -563,15 +521,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.169141Z", - "iopub.status.busy": "2024-12-02T22:46:53.169014Z", - "iopub.status.idle": "2024-12-02T22:46:53.441425Z", - "shell.execute_reply": "2024-12-02T22:46:53.441026Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -642,15 +593,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.442594Z", - "iopub.status.busy": "2024-12-02T22:46:53.442465Z", - "iopub.status.idle": "2024-12-02T22:46:53.576534Z", - "shell.execute_reply": "2024-12-02T22:46:53.576071Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -706,15 +650,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.578160Z", - "iopub.status.busy": "2024-12-02T22:46:53.577636Z", - "iopub.status.idle": "2024-12-02T22:46:53.597584Z", - "shell.execute_reply": "2024-12-02T22:46:53.597165Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -796,15 +733,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:53.598950Z", - "iopub.status.busy": "2024-12-02T22:46:53.598642Z", - "iopub.status.idle": "2024-12-02T22:46:56.612608Z", - "shell.execute_reply": "2024-12-02T22:46:56.612127Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -937,15 +867,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:46:56.613990Z", - "iopub.status.busy": "2024-12-02T22:46:56.613849Z", - "iopub.status.idle": "2024-12-02T22:47:21.651523Z", - "shell.execute_reply": "2024-12-02T22:47:21.650996Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1300,15 +1223,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:21.653719Z", - "iopub.status.busy": "2024-12-02T22:47:21.653572Z", - "iopub.status.idle": "2024-12-02T22:47:34.687047Z", - "shell.execute_reply": "2024-12-02T22:47:34.686607Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1596,15 +1512,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:47:34.689659Z", - "iopub.status.busy": "2024-12-02T22:47:34.689300Z", - "iopub.status.idle": "2024-12-02T22:47:36.284482Z", - "shell.execute_reply": "2024-12-02T22:47:36.283449Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)" @@ -1621,8 +1530,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 1d106a4783..65b07c384d 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -33,14 +33,7 
@@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:49:53.281058Z", - "iopub.status.busy": "2024-12-02T22:49:53.280790Z", - "iopub.status.idle": "2024-12-02T22:50:26.517789Z", - "shell.execute_reply": "2024-12-02T22:50:26.517313Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -70,14 +63,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.519357Z", - "iopub.status.busy": "2024-12-02T22:50:26.519135Z", - "iopub.status.idle": "2024-12-02T22:50:26.555609Z", - "shell.execute_reply": "2024-12-02T22:50:26.555227Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import subprocess, json\n", @@ -104,14 +90,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.557052Z", - "iopub.status.busy": "2024-12-02T22:50:26.556687Z", - "iopub.status.idle": "2024-12-02T22:50:26.580073Z", - "shell.execute_reply": "2024-12-02T22:50:26.579695Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import requests\n", @@ -138,14 +117,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.581299Z", - "iopub.status.busy": "2024-12-02T22:50:26.581150Z", - "iopub.status.idle": "2024-12-02T22:50:26.933134Z", - "shell.execute_reply": "2024-12-02T22:50:26.932703Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import openai\n", @@ -174,14 +146,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:26.934770Z", - "iopub.status.busy": "2024-12-02T22:50:26.934415Z", - "iopub.status.idle": "2024-12-02T22:50:28.842361Z", - "shell.execute_reply": "2024-12-02T22:50:28.841666Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import json\n", @@ -205,15 +170,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:28.844332Z", - "iopub.status.busy": "2024-12-02T22:50:28.843909Z", - "iopub.status.idle": "2024-12-02T22:50:29.199607Z", - "shell.execute_reply": "2024-12-02T22:50:29.198958Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -230,8 +188,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 4e573304ab..af17b44096 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -37,14 +37,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:50:33.572361Z", - "iopub.status.busy": "2024-12-02T22:50:33.572012Z", - "iopub.status.idle": "2024-12-02T22:51:08.810946Z", - "shell.execute_reply": "2024-12-02T22:51:08.810425Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from sglang.utils import (\n", @@ -76,14 +69,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:08.812505Z", - "iopub.status.busy": "2024-12-02T22:51:08.812313Z", - "iopub.status.idle": "2024-12-02T22:51:14.903348Z", - "shell.execute_reply": "2024-12-02T22:51:14.902880Z" 
- } - }, + "metadata": {}, "outputs": [], "source": [ "import subprocess\n", @@ -127,14 +113,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:14.905012Z", - "iopub.status.busy": "2024-12-02T22:51:14.904629Z", - "iopub.status.idle": "2024-12-02T22:51:15.313109Z", - "shell.execute_reply": "2024-12-02T22:51:15.312647Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import requests\n", @@ -174,14 +153,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:15.314520Z", - "iopub.status.busy": "2024-12-02T22:51:15.314377Z", - "iopub.status.idle": "2024-12-02T22:51:15.921170Z", - "shell.execute_reply": "2024-12-02T22:51:15.920710Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -225,14 +197,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:15.922809Z", - "iopub.status.busy": "2024-12-02T22:51:15.922486Z", - "iopub.status.idle": "2024-12-02T22:51:17.150687Z", - "shell.execute_reply": "2024-12-02T22:51:17.150227Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from openai import OpenAI\n", @@ -273,15 +238,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:17.151990Z", - "iopub.status.busy": "2024-12-02T22:51:17.151850Z", - "iopub.status.idle": "2024-12-02T22:51:18.756750Z", - "shell.execute_reply": "2024-12-02T22:51:18.749616Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(embedding_process)" @@ -307,11 +265,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -321,8 +274,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 95a8dd13a6..63e8920a9e 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -30,15 +30,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:23.128756Z", - "iopub.status.busy": "2024-12-02T22:51:23.128496Z", - "iopub.status.idle": "2024-12-02T22:51:55.360578Z", - "shell.execute_reply": "2024-12-02T22:51:55.360159Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -201,15 +194,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.362103Z", - "iopub.status.busy": "2024-12-02T22:51:55.361912Z", - "iopub.status.idle": "2024-12-02T22:51:55.474877Z", - "shell.execute_reply": "2024-12-02T22:51:55.474538Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -253,15 +239,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.476355Z", - "iopub.status.busy": "2024-12-02T22:51:55.476031Z", - "iopub.status.idle": "2024-12-02T22:51:55.545152Z", - "shell.execute_reply": "2024-12-02T22:51:55.544851Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -307,15 +286,8 
@@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:55.546339Z", - "iopub.status.busy": "2024-12-02T22:51:55.546190Z", - "iopub.status.idle": "2024-12-02T22:51:56.180978Z", - "shell.execute_reply": "2024-12-02T22:51:56.180604Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -370,15 +342,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.182456Z", - "iopub.status.busy": "2024-12-02T22:51:56.182078Z", - "iopub.status.idle": "2024-12-02T22:51:56.498669Z", - "shell.execute_reply": "2024-12-02T22:51:56.498331Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -714,15 +679,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.499873Z", - "iopub.status.busy": "2024-12-02T22:51:56.499743Z", - "iopub.status.idle": "2024-12-02T22:51:56.734340Z", - "shell.execute_reply": "2024-12-02T22:51:56.733992Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -784,15 +742,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.735620Z", - "iopub.status.busy": "2024-12-02T22:51:56.735312Z", - "iopub.status.idle": "2024-12-02T22:51:56.967187Z", - "shell.execute_reply": "2024-12-02T22:51:56.966862Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1052,15 +1003,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2024-12-02T22:51:56.968304Z", - "iopub.status.busy": "2024-12-02T22:51:56.968158Z", - "iopub.status.idle": "2024-12-02T22:51:58.584622Z", - "shell.execute_reply": "2024-12-02T22:51:58.583792Z" - } - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "terminate_process(server_process)" @@ -1077,8 +1021,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, From fbca85bdf225ff44117a532e243fe2b99eddf1f0 Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 23:05:48 +0000 Subject: [PATCH 3/6] remove ipynb output --- docs/backend/native_api.ipynb | 60 +- docs/backend/openai_api_completions.ipynb | 960 +--------------------- 2 files changed, 13 insertions(+), 1007 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index cd93439d41..a7a81b8487 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -32,65 +32,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:53:31] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_path='meta-llama/Llama-3.2-1B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-1B-Instruct', chat_template=None, is_embedding=False, revision=None, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, 
schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=412569283, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "[2024-12-02 22:53:38 TP0] Init torch distributed begin.\n", - "[2024-12-02 22:53:39 TP0] Load weight begin. avail mem=4.98 GB\n", - "[2024-12-02 22:53:39 TP0] lm_eval is not installed, GPTQ may not be usable\n", - "[2024-12-02 22:53:39 TP0] Using model weights format ['*.safetensors']\n", - "[2024-12-02 22:53:39 TP0] No model.safetensors.index.json found in remote.\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00\n", - " torch.empty(\n", - "torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 56.00 MiB. GPU 0 has a total capacity of 79.10 GiB of which 27.50 MiB is free. Process 3843098 has 32.29 GiB memory in use. Process 3908588 has 41.52 GiB memory in use. Including non-PyTorch memory, this process has 5.24 GiB memory in use. Of the allocated memory 4.63 GiB is allocated by PyTorch, and 19.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)\n", - "\n" - ] - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", - "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", - "\u001b[1;31mClick here for more info. \n", - "\u001b[1;31mView Jupyter log for further details." 
- ] - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 499b7fb740..067a046885 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -38,141 +38,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:24] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=675216789, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:32 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] Load weight begin. avail mem=46.29 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:33 TP0] Using model weights format ['*.safetensors']\n", - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -205,35 +71,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 42, #cached-token: 1, cache hit rate: 2.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Decode batch. #running-req: 1, #token: 76, token usage: 0.00, gen throughput (token/s): 7.04, #queue-req: 0\n", - "[2024-12-02 22:46:50] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: ChatCompletion(id='d1d6ed6246d5474e94dec1325b85ede3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179610, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -266,50 +104,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:50 TP0] Prefill batch. #new-seq: 1, #new-token: 51, #cached-token: 25, cache hit rate: 20.63%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:50 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 106, token usage: 0.00, gen throughput (token/s): 125.80, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 146, token usage: 0.00, gen throughput (token/s): 142.02, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Decode batch. #running-req: 1, #token: 186, token usage: 0.00, gen throughput (token/s): 141.54, #queue-req: 0\n", - "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Ancient Rome's major achievements include:

1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their engineering skills and architectural innovations.
2. **Law and Governance**: The Romans developed the Twelve Tables, a precursor to modern law, and established a system of governance that included the Senate and the Assemblies.
3. **Military Conquests**: Rome expanded its territories through a series of military campaigns, creating a vast empire that stretched from Britain to Egypt.
4. **Infrastructure Development**: They built roads, bridges, aqueducts, and canals, which facilitated trade
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -348,17 +143,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 10, #cached-token: 30, cache hit rate: 33.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "This is only a test" - ] - } - ], + "outputs": [], "source": [ "stream = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -384,42 +169,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 32.57%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 128.02, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 146.74, #queue-req: 0\n", - "[2024-12-02 22:46:52] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='bd53d29a96bc45839dcbd37d3dcc1206', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1733179612, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -448,50 +198,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 31.35%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:52 TP0] frequency_penalty, presence_penalty, and repetition_penalty are not supported when using the default overlap scheduler. They will be ignored. Please add `--disable-overlap` when launching the server if you need these features. The speed will be slower in that case.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. #running-req: 1, #token: 42, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:52 TP0] Decode batch. 
#running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 145.33, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 122, token usage: 0.00, gen throughput (token/s): 144.24, #queue-req: 0\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Response: Completion(id='f5503996a99040b19445f8121b49c4f1', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' In 200 words or less.\\nAs the stars whizzed by outside, Captain Orion gazed out at the vast expanse of space. Her ship, the Aurora, had been traveling for months, searching for a new home for humanity. The Earth was dying, and the crew of the Aurora was determined to find a new planet to call their own.\\nOrion\\'s eyes sparkled as she scanned the data streaming in from the ship\\'s sensors. \"Captain, I\\'m reading a planet with breathable air and liquid water,\" said her navigator, Ensign Amy K\\'Rhyn.', matched_stop='\\n\\n')], created=1733179613, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=118, prompt_tokens=10, total_tokens=128, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -523,34 +230,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 1, #new-token: 19, #cached-token: 30, cache hit rate: 37.61%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{\"name\": \"Paris\", \"population\": 2147000}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "\n", @@ -595,28 +275,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. 
#new-seq: 1, #new-token: 12, #cached-token: 30, cache hit rate: 42.75%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:42866 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Paris" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", @@ -652,35 +311,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:53] INFO: 127.0.0.1:47994 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job created with ID: batch_eafe977c-1e11-447a-a77f-6905cd2cf267" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Prefill batch. #new-seq: 2, #new-token: 18, #cached-token: 62, cache hit rate: 50.56%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -735,93 +366,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:53 TP0] Decode batch. #running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 87.91, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Batch job status: validating...trying again in 3 seconds...\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/batches/batch_eafe977c-1e11-447a-a77f-6905cd2cf267 HTTP/1.1\" 200 OK\n", - "Batch job completed successfully!\n", - "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"GET /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc/content HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Request request-1:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request request-2:" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1733179613, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes 
such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Cleaning up files..." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56] INFO: 127.0.0.1:47994 - \"DELETE /v1/files/backend_result_file-62a6f368-3d24-4e97-8920-1e5d12769ffc HTTP/1.1\" 200 OK\n" - ] - } - ], + "outputs": [], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", @@ -869,287 +414,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:46:56] INFO: 127.0.0.1:48004 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:56 TP0] Prefill batch. #new-seq: 100, #new-token: 3000, #cached-token: 2500, cache hit rate: 45.77%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:46:56 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 206.20, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 11850.91, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.06, gen throughput (token/s): 11616.06, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:57 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 11346.97, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 11089.22, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 10835.27, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:58 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 10583.19, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:59 TP0] Decode batch. 
#running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 10363.49, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:46:59 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.17, gen throughput (token/s): 10145.38, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.19, gen throughput (token/s): 9927.85, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.21, gen throughput (token/s): 9719.36, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:00 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.23, gen throughput (token/s): 9533.96, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:01 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.25, gen throughput (token/s): 9339.12, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:06] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 1 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:09] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 2 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:12] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 3 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, 
- "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:15] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 4 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:18] INFO: 127.0.0.1:59256 - \"GET /v1/batches/batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Batch job details (check 5 / 5) // ID: batch_028e2808-382e-4aa4-af37-d7f4e763c3f4 // Status: completed // Created at: 1733179616 // Input file ID: backend_input_file-8762851c-2789-4b30-b4df-c66edde1d5bb // Output file ID: backend_result_file-3d82e8c5-8022-43ee-9659-99e4124ec4f8" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Request counts: Total: 100 // Completed: 100 // Failed: 0" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", @@ -1225,208 +490,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/files HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:47:21] INFO: 127.0.0.1:48046 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Created batch job with ID: batch_94c72d7f-b065-4c83-8765-d257480f694f" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Initial status: validating" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 83, #new-token: 83, #cached-token: 4482, cache hit rate: 68.73%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 290, #new-token: 8192, #cached-token: 7743, cache hit rate: 56.55%, token usage: 0.01, #running-req: 83, #queue-req: 127\n", - "[2024-12-02 22:47:21 TP0] Prefill batch. #new-seq: 128, #new-token: 3825, #cached-token: 3215, cache hit rate: 54.26%, token usage: 0.05, #running-req: 372, #queue-req: 1\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:22 TP0] Decode batch. #running-req: 500, #token: 28525, token usage: 0.14, gen throughput (token/s): 678.38, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:23 TP0] Decode batch. 
#running-req: 500, #token: 48525, token usage: 0.23, gen throughput (token/s): 26596.73, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 68525, token usage: 0.33, gen throughput (token/s): 25443.33, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:24 TP0] Decode batch. #running-req: 500, #token: 88525, token usage: 0.42, gen throughput (token/s): 24276.84, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:25 TP0] Decode batch. #running-req: 500, #token: 108525, token usage: 0.52, gen throughput (token/s): 23133.50, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:26 TP0] Decode batch. #running-req: 500, #token: 128525, token usage: 0.61, gen throughput (token/s): 22139.46, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:27 TP0] Decode batch. #running-req: 500, #token: 148525, token usage: 0.71, gen throughput (token/s): 21220.49, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:28 TP0] Decode batch. #running-req: 500, #token: 168525, token usage: 0.80, gen throughput (token/s): 20377.60, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:29 TP0] Decode batch. #running-req: 500, #token: 188525, token usage: 0.90, gen throughput (token/s): 19555.01, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:30 TP0] Decode batch. #running-req: 500, #token: 208525, token usage: 0.99, gen throughput (token/s): 18893.70, #queue-req: 0\n", - "[2024-12-02 22:47:30 TP0] Decode out of memory happened. #retracted_reqs: 23, #new_token_ratio: 0.3087 -> 0.8200\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:31 TP0] Decode out of memory happened. #retracted_reqs: 21, #new_token_ratio: 0.8009 -> 0.8600\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:31] INFO: 127.0.0.1:60160 - \"POST /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f/cancel HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Cancellation initiated. 
Status: cancelling" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"GET /v1/batches/batch_94c72d7f-b065-4c83-8765-d257480f694f HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Current status: cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Batch job successfully cancelled" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:47:34] INFO: 127.0.0.1:60160 - \"DELETE /v1/files/backend_input_file-0acbc8d2-85ef-406a-b610-1591357c7615 HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "Successfully cleaned up input file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Successfully deleted local batch_requests.jsonl file" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import json\n", "import time\n", From f946fac4d1fd722e23afc86d14e7d018d1ae1ded Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Mon, 2 Dec 2024 23:06:34 +0000 Subject: [PATCH 4/6] remove ipynb output --- docs/start/send_request.ipynb | 781 +--------------------------------- 1 file changed, 7 insertions(+), 774 deletions(-) diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index 63e8920a9e..4cb46f1edc 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -32,141 +32,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, revision=None, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, cpu_offload_gb=0, tp_size=1, stream_interval=1, random_seed=720139840, constrained_json_whitespace_pattern=None, watchdog_timeout=300, download_dir=None, base_gpu_id=0, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, enable_metrics=False, decode_log_interval=40, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_radix_cache=False, disable_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_overlap_schedule=False, enable_mixed_chunk=False, enable_dp_attention=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_nan_detection=False, enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1, delete_ckpt_after_loading=False)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n", - "/opt/dlami/nvme/chenyang/miniconda3/envs/sglang/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:37 TP0] Init torch distributed begin.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] Load weight begin. avail mem=46.29 GB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] lm_eval is not installed, GPTQ may not be usable\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:38 TP0] Using model weights format ['*.safetensors']\n", - "\r", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00

NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", @@ -196,28 +62,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 41, #cached-token: 1, cache hit rate: 2.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55] INFO: 127.0.0.1:55794 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'id': '9bb7a92e66884dd597ed21d40b371dee', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import subprocess, json\n", "\n", @@ -241,28 +86,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 41, cache hit rate: 46.15%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55] INFO: 127.0.0.1:55804 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'id': '9e218a4ee02c483caac6fe038d1d75e7', 'object': 'chat.completion', 'created': 1733179915, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'The capital of France is Paris.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 42, 'total_tokens': 50, 'completion_tokens': 8, 'prompt_tokens_details': None}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import requests\n", "\n", @@ -288,35 +112,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:55 TP0] Prefill batch. #new-seq: 1, #new-token: 13, #cached-token: 30, cache hit rate: 53.73%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "[2024-12-02 22:51:55 TP0] Decode batch. #running-req: 1, #token: 60, token usage: 0.00, gen throughput (token/s): 6.97, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55806 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "ChatCompletion(id='14c5f53930714f6a8aaf2c107d60ab2d', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. Country: Japan\\n Capital: Tokyo\\n\\n2. Country: Australia\\n Capital: Canberra\\n\\n3. 
Country: Brazil\\n Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1733179916, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=43, total_tokens=86, completion_tokens_details=None, prompt_tokens_details=None))" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -344,308 +140,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55816 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 42, cache hit rate: 64.41%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - "Here" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " are" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " countries" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " their" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " capitals" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 57, token usage: 0.00, gen throughput (token/s): 133.00, #queue-req: 0\n", - " Japan" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Tokyo" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Australia" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Canberra" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Country" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Brazil" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " " - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Capital" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Bras" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ília" - ] - } - ], + "outputs": [], "source": [ "import openai\n", "\n", @@ -681,41 +176,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Prefill batch. #new-seq: 1, #new-token: 3, #cached-token: 3, cache hit rate: 63.93%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 17, token usage: 0.00, gen throughput (token/s): 137.66, #queue-req: 0\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55826 - \"POST /generate HTTP/1.1\" 200 OK\n" - ] - }, - { - "data": { - "text/html": [ - "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 3, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'efb0c50df0a34354b3762c764925c8e2'}}" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import requests\n", "\n", @@ -744,235 +205,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56] INFO: 127.0.0.1:55836 - \"POST /generate HTTP/1.1\" 200 OK\n", - "[2024-12-02 22:51:56 TP0] Prefill batch. 
#new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 64.55%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", - " a" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " city" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " of" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " romance" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " art" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " fashion" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " cuisine" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Paris" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " is" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " a" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " must" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "visit" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2024-12-02 22:51:56 TP0] Decode batch. #running-req: 1, #token: 25, token usage: 0.00, gen throughput (token/s): 138.56, #queue-req: 0\n", - " destination" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " for" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " anyone" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " who" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " loves" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " history" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " architecture" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "," - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " and" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " culture" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " From" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " the" - ] - } - ], + "outputs": [], "source": [ "import requests, json\n", "\n", From 6db5d1c6d72d686e0102141535e142094ecfa7dd Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Tue, 3 Dec 2024 05:32:23 +0000 Subject: [PATCH 5/6] revert classify --- docs/backend/native_api.ipynb | 10 ++++++++-- docs/backend/offline_engine_api.ipynb | 3 ++- docs/backend/openai_api_completions.ipynb | 3 ++- docs/backend/openai_api_embeddings.ipynb | 3 ++- docs/backend/openai_api_vision.ipynb | 3 ++- docs/start/send_request.ipynb | 3 ++- python/sglang/srt/server.py | 2 +- 7 files changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index a7a81b8487..849cdd8d69 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -332,7 +332,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n", "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", "\n", - "url = \"http://localhost:30030/encode\"\n", + "url = \"http://localhost:30030/classify\"\n", "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", "\n", "responses = requests.post(url, json=data).json()\n", @@ -351,6 +351,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sglang", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -360,7 +365,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 7ce89d435d..ac997641b0 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -195,7 +195,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index 067a046885..ae2e8fc868 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -594,7 +594,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index 65b07c384d..d3df74ea1a 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -188,7 +188,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index af17b44096..64496c56a8 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -274,7 +274,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb 
index 4cb46f1edc..b8b7d21964 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -254,7 +254,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.11.7" } }, "nbformat": 4, diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index fc8ac150b3..7b91cb6979 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -329,7 +329,7 @@ async def encode_request(obj: EmbeddingReqInput, request: Request): ) -@app.api_route("/encode", methods=["POST", "PUT"]) +@app.api_route("/classify", methods=["POST", "PUT"]) @time_func_latency async def classify_request(obj: EmbeddingReqInput, request: Request): """Handle a reward model request. Now the arguments and return values are the same as embedding models.""" From 386df39a792f187fc8fce021b72a07ef6aa182c4 Mon Sep 17 00:00:00 2001 From: zhaochenyang20 Date: Tue, 3 Dec 2024 05:32:42 +0000 Subject: [PATCH 6/6] revert classify --- docs/backend/native_api.ipynb | 8 +------- docs/backend/offline_engine_api.ipynb | 3 +-- docs/backend/openai_api_completions.ipynb | 3 +-- docs/backend/openai_api_embeddings.ipynb | 3 +-- docs/backend/openai_api_vision.ipynb | 3 +-- docs/start/send_request.ipynb | 3 +-- 6 files changed, 6 insertions(+), 17 deletions(-) diff --git a/docs/backend/native_api.ipynb b/docs/backend/native_api.ipynb index 849cdd8d69..26758f7f97 100644 --- a/docs/backend/native_api.ipynb +++ b/docs/backend/native_api.ipynb @@ -351,11 +351,6 @@ } ], "metadata": { - "kernelspec": { - "display_name": "sglang", - "language": "python", - "name": "python3" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -365,8 +360,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index ac997641b0..7ce89d435d 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -195,8 +195,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_completions.ipynb b/docs/backend/openai_api_completions.ipynb index ae2e8fc868..067a046885 100644 --- a/docs/backend/openai_api_completions.ipynb +++ b/docs/backend/openai_api_completions.ipynb @@ -594,8 +594,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_embeddings.ipynb b/docs/backend/openai_api_embeddings.ipynb index d3df74ea1a..65b07c384d 100644 --- a/docs/backend/openai_api_embeddings.ipynb +++ b/docs/backend/openai_api_embeddings.ipynb @@ -188,8 +188,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/openai_api_vision.ipynb b/docs/backend/openai_api_vision.ipynb index 64496c56a8..af17b44096 100644 --- a/docs/backend/openai_api_vision.ipynb +++ b/docs/backend/openai_api_vision.ipynb @@ -274,8 +274,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/start/send_request.ipynb b/docs/start/send_request.ipynb index b8b7d21964..4cb46f1edc 100644 --- a/docs/start/send_request.ipynb +++ b/docs/start/send_request.ipynb @@ -254,8 +254,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" + "pygments_lexer": "ipython3" } }, "nbformat": 4,