From 156ffe2d32413fc1e220261824cec26a3058ea56 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sun, 26 Jan 2025 20:36:30 -0800 Subject: [PATCH 1/9] Docs fix about EAGLE and streaming output --- docs/backend/offline_engine_api.ipynb | 156 +++++++++++++++++++++--- docs/backend/speculative_decoding.ipynb | 17 ++- 2 files changed, 154 insertions(+), 19 deletions(-) diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 7ce89d435d5..279c13cbafd 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -32,9 +32,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/chenyang/miniconda3/envs/sgl-zbz/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2025-01-26 19:57:24,783\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", + "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 **Note**: To run the following tests or benchmarks, you also need to install the [**cutex**](https://pypi.org/project/cutex/) module. \n", + "> **Requirement**: Python 3.6+ on a Unix-like OS with **fcntl** support. \n", + "> **Installation**: \n", + "> ```bash\n", + "> pip install cutex\n", + "> ```\n", + "\n", "### Performance Highlights\n", "\n", "- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n", "- **Standard SGLang Decoding**: ~156 tokens/s\n", "- **EAGLE Decoding in SGLang**: ~297 tokens/s\n", - "- **EAGLE Decoding in SGLang (w/ `torch.compile`)**: ~316 tokens/s\n", + "- **EAGLE Decoding in SGLang (w/ torch.compile)**: ~316 tokens/s\n", "\n", "All benchmarks below were run on a single H100." ] @@ -159,6 +166,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "sgl-zbz", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -168,7 +180,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.16" } }, "nbformat": 4, From 26f88c5fdec75e11eed7ca0545d539f9c4936620 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sun, 26 Jan 2025 20:39:37 -0800 Subject: [PATCH 2/9] Change timeout mins --- docs/backend/offline_engine_api.ipynb | 102 ++---------------------- docs/backend/speculative_decoding.ipynb | 8 +- 2 files changed, 9 insertions(+), 101 deletions(-) diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 279c13cbafd..2c5b951fd58 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -32,27 +32,9 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/chenyang/miniconda3/envs/sgl-zbz/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "2025-01-26 19:57:24,783\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n", - "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 Date: Sun, 26 Jan 2025 21:52:50 -0800 Subject: [PATCH 3/9] Solve Streaming output issue for async --- docs/backend/offline_engine_api.ipynb | 103 ++++++++++++++++---------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 2c5b951fd58..111f3cf29c8 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -37,7 +37,10 @@ "outputs": [], "source": [ "# launch the offline engine\n", - "\n", + "from sglang.utils import (\n", + " generate_text_no_repeats,\n", + " async_generate_no_repeats,\n", + ")\n", "import sglang as sgl\n", "import asyncio\n", "\n", @@ -85,40 +88,6 @@ "metadata": {}, "outputs": [], "source": [ - "def remove_overlap(existing_text, new_chunk):\n", - " \"\"\"\n", - " Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'\n", - " and removes that overlap from the start of 'new_chunk'.\n", - " \"\"\"\n", - " max_overlap = 0\n", - " max_possible = min(len(existing_text), len(new_chunk))\n", - "\n", - " for i in range(max_possible, 0, -1):\n", - " if existing_text.endswith(new_chunk[:i]):\n", - " max_overlap = i\n", - " break\n", - "\n", - " return new_chunk[max_overlap:]\n", - "\n", - "\n", - "def generate_text_no_repeats(llm, prompt, sampling_params):\n", - " \"\"\"\n", - " Example function that:\n", - " 1) Streams the text,\n", - " 2) Removes chunk overlaps,\n", - " 3) Returns the merged text.\n", - " \"\"\"\n", - " final_text = \"\"\n", - " for chunk in llm.generate(prompt, sampling_params, stream=True):\n", - " chunk_text = chunk[\"text\"]\n", - "\n", - " cleaned_chunk = remove_overlap(final_text, chunk_text)\n", - "\n", - " final_text += cleaned_chunk\n", - "\n", - " return final_text\n", - "\n", - "\n", "prompts = [\n", " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n", " \"Provide a concise factual statement about France’s capital city. 
The capital of France is\",\n", @@ -187,6 +156,57 @@ "metadata": {}, "outputs": [], "source": [ + "# prompts = [\n", + "# \"Hello, my name is\",\n", + "# \"The capital of France is\",\n", + "# \"The future of AI is\",\n", + "# ]\n", + "# sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", + "\n", + "# print(\"\\n=== Testing asynchronous streaming generation ===\")\n", + "\n", + "\n", + "# async def main():\n", + "# for prompt in prompts:\n", + "# print(f\"\\nPrompt: {prompt}\")\n", + "# print(\"Generated text: \", end=\"\", flush=True)\n", + "\n", + "# generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", + "# async for chunk in generator:\n", + "# print(chunk[\"text\"], end=\"\", flush=True)\n", + "# print()\n", + "\n", + "\n", + "# asyncio.run(main())\n", + "\n", + "import asyncio\n", + "\n", + "\n", + "# This function checks if the end of existing_text matches the start of new_chunk,\n", + "# and removes that overlap from new_chunk so it won't be printed twice.\n", + "def remove_overlap(existing_text, new_chunk):\n", + " max_overlap = 0\n", + " max_possible = min(len(existing_text), len(new_chunk))\n", + " for i in range(max_possible, 0, -1):\n", + " if existing_text.endswith(new_chunk[:i]):\n", + " max_overlap = i\n", + " break\n", + " return new_chunk[max_overlap:]\n", + "\n", + "\n", + "# This async generator streams tokens from the model, cleans them\n", + "# to avoid repeating partial tokens, and yields only the new bits.\n", + "async def async_generate_no_repeats(llm, prompt, sampling_params):\n", + " final_text = \"\"\n", + " generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", + " async for chunk in generator:\n", + " chunk_text = chunk[\"text\"]\n", + " cleaned_chunk = remove_overlap(final_text, chunk_text)\n", + " final_text += cleaned_chunk\n", + " yield cleaned_chunk\n", + "\n", + "\n", + "# Example usage\n", "prompts = [\n", " \"Hello, my name is\",\n", " \"The capital of France is\",\n", @@ -194,7 +214,7 @@ "]\n", "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", "\n", - "print(\"\\n=== Testing asynchronous streaming generation ===\")\n", + "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n", "\n", "\n", "async def main():\n", @@ -202,10 +222,13 @@ " print(f\"\\nPrompt: {prompt}\")\n", " print(\"Generated text: \", end=\"\", flush=True)\n", "\n", - " generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", - " async for chunk in generator:\n", - " print(chunk[\"text\"], end=\"\", flush=True)\n", - " print()\n", + " # Replace direct calls to async_generate with our custom overlap-aware version\n", + " async for cleaned_chunk in async_generate_no_repeats(\n", + " llm, prompt, sampling_params\n", + " ):\n", + " print(cleaned_chunk, end=\"\", flush=True)\n", + "\n", + " print() # New line after each prompt\n", "\n", "\n", "asyncio.run(main())" From c5d45f54b9b5885134847b44d60bfd1521a2e46e Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sun, 26 Jan 2025 22:14:48 -0800 Subject: [PATCH 4/9] DocFix in the Speculative_decoding --- docs/backend/speculative_decoding.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/backend/speculative_decoding.ipynb b/docs/backend/speculative_decoding.ipynb index aae3d27486d..cb4ab13879a 100644 --- a/docs/backend/speculative_decoding.ipynb +++ b/docs/backend/speculative_decoding.ipynb @@ -20,7 +20,7 @@ "- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n", 
"- **Standard SGLang Decoding**: ~156 tokens/s\n", "- **EAGLE Decoding in SGLang**: ~297 tokens/s\n", - "- **EAGLE Decoding in SGLang (w/ torch.compile)**: ~316 tokens/s\n", + "- EAGLE Decoding in SGLang (w/ `torch.compile`): ~316 tokens/s\n", "\n", "All benchmarks below were run on a single H100." ] From 6110fb59097e00f959cf4dfdf2f920f5dd5a4b8e Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sun, 26 Jan 2025 22:15:41 -0800 Subject: [PATCH 5/9] Fix for the cutex doc --- docs/backend/speculative_decoding.ipynb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/backend/speculative_decoding.ipynb b/docs/backend/speculative_decoding.ipynb index cb4ab13879a..3216d98030b 100644 --- a/docs/backend/speculative_decoding.ipynb +++ b/docs/backend/speculative_decoding.ipynb @@ -8,9 +8,7 @@ "\n", "SGLang now provides an EAGLE-based speculative decoding option. The implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n", "\n", - "> **Note**: To run the following tests or benchmarks, you also need to install the [**cutex**](https://pypi.org/project/cutex/) module. \n", - "> **Requirement**: Python 3.6+ on a Unix-like OS with **fcntl** support. \n", - "> **Installation**: \n", + "To run the following tests or benchmarks, you also need to install [**cutex**](https://pypi.org/project/cutex/): \n", "> ```bash\n", "> pip install cutex\n", "> ```\n", From 563065670eff9c2e46c1fc39c5939636ced1a02b Mon Sep 17 00:00:00 2001 From: Chayenne Date: Sun, 26 Jan 2025 22:22:07 -0800 Subject: [PATCH 6/9] addup timeout-minutes --- .github/workflows/execute-notebook.yml | 2 +- python/sglang/utils.py | 48 ++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/.github/workflows/execute-notebook.yml b/.github/workflows/execute-notebook.yml index e03edd6ce79..49d649797ed 100644 --- a/.github/workflows/execute-notebook.yml +++ b/.github/workflows/execute-notebook.yml @@ -42,7 +42,7 @@ jobs: python -m ipykernel install --user --name python3 --display-name "Python 3" - name: Execute notebooks - timeout-minutes: 30 + timeout-minutes: 40 run: | cd docs make clean diff --git a/python/sglang/utils.py b/python/sglang/utils.py index 742eebc3bc9..d1e9b5a6315 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -373,3 +373,51 @@ def __call__(self, obj: Any): if isinstance(obj, ty): return fn(obj) raise ValueError(f"Invalid object: {obj}") + + +def remove_overlap(existing_text, new_chunk): + """ + Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk' + and removes that overlap from the start of 'new_chunk'. + """ + max_overlap = 0 + max_possible = min(len(existing_text), len(new_chunk)) + + for i in range(max_possible, 0, -1): + if existing_text.endswith(new_chunk[:i]): + max_overlap = i + break + + return new_chunk[max_overlap:] + + +def generate_text_no_repeats(llm, prompt, sampling_params): + """ + Example function that: + 1) Streams the text, + 2) Removes chunk overlaps, + 3) Returns the merged text. + """ + final_text = "" + for chunk in llm.generate(prompt, sampling_params, stream=True): + chunk_text = chunk["text"] + + cleaned_chunk = remove_overlap(final_text, chunk_text) + + final_text += cleaned_chunk + + return final_text + + +async def async_generate_no_repeats(llm, prompt, sampling_params): + """ + Streams tokens asynchronously, removes chunk overlaps, + and yields the cleaned chunk in real time for printing. 
+ """ + final_text = "" + generator = await llm.async_generate(prompt, sampling_params, stream=True) + async for chunk in generator: + chunk_text = chunk["text"] + cleaned_chunk = remove_overlap(final_text, chunk_text) + final_text += cleaned_chunk + yield cleaned_chunk # yield the non-overlapping portion From bc315bbad525d737896f9b361f909d82a84a7769 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 27 Jan 2025 11:29:07 -0800 Subject: [PATCH 7/9] Some quick fix for docs --- docs/backend/function_calling.ipynb | 10 +++- docs/backend/offline_engine_api.ipynb | 75 ++++--------------------- docs/backend/speculative_decoding.ipynb | 6 +- python/sglang/utils.py | 16 ++---- 4 files changed, 27 insertions(+), 80 deletions(-) diff --git a/docs/backend/function_calling.ipynb b/docs/backend/function_calling.ipynb index 3de80aadf11..05e7108e60e 100644 --- a/docs/backend/function_calling.ipynb +++ b/docs/backend/function_calling.ipynb @@ -507,7 +507,15 @@ ], "metadata": { "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/docs/backend/offline_engine_api.ipynb b/docs/backend/offline_engine_api.ipynb index 111f3cf29c8..58d24ac3ff6 100644 --- a/docs/backend/offline_engine_api.ipynb +++ b/docs/backend/offline_engine_api.ipynb @@ -37,10 +37,7 @@ "outputs": [], "source": [ "# launch the offline engine\n", - "from sglang.utils import (\n", - " generate_text_no_repeats,\n", - " async_generate_no_repeats,\n", - ")\n", + "from sglang.utils import stream_and_merge, async_stream_and_merge\n", "import sglang as sgl\n", "import asyncio\n", "\n", @@ -103,7 +100,7 @@ "\n", "for prompt in prompts:\n", " print(f\"Prompt: {prompt}\")\n", - " merged_output = generate_text_no_repeats(llm, prompt, sampling_params)\n", + " merged_output = stream_and_merge(llm, prompt, sampling_params)\n", " print(\"Generated text:\", merged_output)\n", " print()" ] @@ -122,9 +119,9 @@ "outputs": [], "source": [ "prompts = [\n", - " \"Hello, my name is\",\n", - " \"The capital of France is\",\n", - " \"The future of AI is\",\n", + " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n", + " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n", + " \"Explain possible future trends in artificial intelligence. 
The future of AI is\",\n", "]\n", "\n", "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", @@ -156,62 +153,12 @@ "metadata": {}, "outputs": [], "source": [ - "# prompts = [\n", - "# \"Hello, my name is\",\n", - "# \"The capital of France is\",\n", - "# \"The future of AI is\",\n", - "# ]\n", - "# sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", - "\n", - "# print(\"\\n=== Testing asynchronous streaming generation ===\")\n", - "\n", - "\n", - "# async def main():\n", - "# for prompt in prompts:\n", - "# print(f\"\\nPrompt: {prompt}\")\n", - "# print(\"Generated text: \", end=\"\", flush=True)\n", - "\n", - "# generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", - "# async for chunk in generator:\n", - "# print(chunk[\"text\"], end=\"\", flush=True)\n", - "# print()\n", - "\n", - "\n", - "# asyncio.run(main())\n", - "\n", - "import asyncio\n", - "\n", - "\n", - "# This function checks if the end of existing_text matches the start of new_chunk,\n", - "# and removes that overlap from new_chunk so it won't be printed twice.\n", - "def remove_overlap(existing_text, new_chunk):\n", - " max_overlap = 0\n", - " max_possible = min(len(existing_text), len(new_chunk))\n", - " for i in range(max_possible, 0, -1):\n", - " if existing_text.endswith(new_chunk[:i]):\n", - " max_overlap = i\n", - " break\n", - " return new_chunk[max_overlap:]\n", - "\n", - "\n", - "# This async generator streams tokens from the model, cleans them\n", - "# to avoid repeating partial tokens, and yields only the new bits.\n", - "async def async_generate_no_repeats(llm, prompt, sampling_params):\n", - " final_text = \"\"\n", - " generator = await llm.async_generate(prompt, sampling_params, stream=True)\n", - " async for chunk in generator:\n", - " chunk_text = chunk[\"text\"]\n", - " cleaned_chunk = remove_overlap(final_text, chunk_text)\n", - " final_text += cleaned_chunk\n", - " yield cleaned_chunk\n", - "\n", - "\n", - "# Example usage\n", "prompts = [\n", - " \"Hello, my name is\",\n", - " \"The capital of France is\",\n", - " \"The future of AI is\",\n", + " \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n", + " \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n", + " \"Explain possible future trends in artificial intelligence. 
The future of AI is\",\n", "]\n", + "\n", "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n", "\n", "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n", @@ -223,9 +170,7 @@ " print(\"Generated text: \", end=\"\", flush=True)\n", "\n", " # Replace direct calls to async_generate with our custom overlap-aware version\n", - " async for cleaned_chunk in async_generate_no_repeats(\n", - " llm, prompt, sampling_params\n", - " ):\n", + " async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n", " print(cleaned_chunk, end=\"\", flush=True)\n", "\n", " print() # New line after each prompt\n", diff --git a/docs/backend/speculative_decoding.ipynb b/docs/backend/speculative_decoding.ipynb index 3216d98030b..d69436eed17 100644 --- a/docs/backend/speculative_decoding.ipynb +++ b/docs/backend/speculative_decoding.ipynb @@ -15,9 +15,9 @@ "\n", "### Performance Highlights\n", "\n", - "- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n", - "- **Standard SGLang Decoding**: ~156 tokens/s\n", - "- **EAGLE Decoding in SGLang**: ~297 tokens/s\n", + "- Official EAGLE code ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n", + "- Standard SGLang Decoding: ~156 tokens/s\n", + "- EAGLE Decoding in SGLang: ~297 tokens/s\n", "- EAGLE Decoding in SGLang (w/ `torch.compile`): ~316 tokens/s\n", "\n", "All benchmarks below were run on a single H100." diff --git a/python/sglang/utils.py b/python/sglang/utils.py index d1e9b5a6315..399427ef34c 100644 --- a/python/sglang/utils.py +++ b/python/sglang/utils.py @@ -375,25 +375,22 @@ def __call__(self, obj: Any): raise ValueError(f"Invalid object: {obj}") -def remove_overlap(existing_text, new_chunk): +def trim_overlap(existing_text, new_chunk): """ Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk' and removes that overlap from the start of 'new_chunk'. """ max_overlap = 0 max_possible = min(len(existing_text), len(new_chunk)) - for i in range(max_possible, 0, -1): if existing_text.endswith(new_chunk[:i]): max_overlap = i break - return new_chunk[max_overlap:] -def generate_text_no_repeats(llm, prompt, sampling_params): +def stream_and_merge(llm, prompt, sampling_params): """ - Example function that: 1) Streams the text, 2) Removes chunk overlaps, 3) Returns the merged text. @@ -401,15 +398,12 @@ def generate_text_no_repeats(llm, prompt, sampling_params): final_text = "" for chunk in llm.generate(prompt, sampling_params, stream=True): chunk_text = chunk["text"] - - cleaned_chunk = remove_overlap(final_text, chunk_text) - + cleaned_chunk = trim_overlap(final_text, chunk_text) final_text += cleaned_chunk - return final_text -async def async_generate_no_repeats(llm, prompt, sampling_params): +async def async_stream_and_merge(llm, prompt, sampling_params): """ Streams tokens asynchronously, removes chunk overlaps, and yields the cleaned chunk in real time for printing. 
@@ -418,6 +412,6 @@ async def async_generate_no_repeats(llm, prompt, sampling_params): generator = await llm.async_generate(prompt, sampling_params, stream=True) async for chunk in generator: chunk_text = chunk["text"] - cleaned_chunk = remove_overlap(final_text, chunk_text) + cleaned_chunk = trim_overlap(final_text, chunk_text) final_text += cleaned_chunk yield cleaned_chunk # yield the non-overlapping portion From de389936a4c89fefc11a67466d550acf6cbc47d9 Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 27 Jan 2025 17:50:03 -0800 Subject: [PATCH 8/9] fix flashInfer Doc Issue for installing sgl --- docs/start/install.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/start/install.md b/docs/start/install.md index bd39947a1b0..28ac19ab91f 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -5,6 +5,7 @@ You can install SGLang using any of the methods below. ## Method 1: With pip ``` pip install --upgrade pip +pip install sgl-kernel --force-reinstall --no-deps pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ ``` @@ -17,10 +18,11 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip +pip install sgl-kernel --force-reinstall --no-deps pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ ``` -Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. +Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. If you meet with issue like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing FlashInfer with Version 0.1.6 instead of the latest version could solve it. Note: To AMD ROCm system with Instinct/MI GPUs, do following instead: @@ -30,6 +32,7 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git cd sglang pip install --upgrade pip +pip install sgl-kernel --force-reinstall --no-deps pip install -e "python[all_hip]" ``` From d72db2d22f1ff3d02c08a8a400be32a27cdf0e3a Mon Sep 17 00:00:00 2001 From: Chayenne Date: Mon, 27 Jan 2025 17:54:37 -0800 Subject: [PATCH 9/9] fix sentence usage --- docs/start/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/start/install.md b/docs/start/install.md index 28ac19ab91f..90964ac6b6c 100644 --- a/docs/start/install.md +++ b/docs/start/install.md @@ -22,7 +22,7 @@ pip install sgl-kernel --force-reinstall --no-deps pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/ ``` -Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. If you meet with issue like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing FlashInfer with Version 0.1.6 instead of the latest version could solve it. +Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions. If you meet with issue like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing FlashInfer with some older version like 0.1.6 instead of the latest version could solve it. 
Note: For AMD ROCm systems with Instinct/MI GPUs, do the following instead:
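
For reference, below is a self-contained sketch of the `trim_overlap` helper this series adds to `python/sglang/utils.py`, together with a small illustrative check (the example strings are hypothetical, not taken from the notebooks). It shows why the streamed chunks can be merged without printing the same text twice:

```python
def trim_overlap(existing_text, new_chunk):
    """
    Find the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
    and drop that overlap from the start of 'new_chunk'.
    """
    max_overlap = 0
    max_possible = min(len(existing_text), len(new_chunk))
    for i in range(max_possible, 0, -1):
        if existing_text.endswith(new_chunk[:i]):
            max_overlap = i
            break
    return new_chunk[max_overlap:]


# Streamed chunks may repeat text that was already emitted; trimming the
# overlap keeps the merged output clean.
print(trim_overlap("Hello, my na", "my name is"))        # -> "me is"
print(trim_overlap("The capital", " of France is ..."))  # -> " of France is ..." (no overlap)
```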