Docs fix about EAGLE and streaming output #3166

Merged: 12 commits merged on Jan 28, 2025
2 changes: 1 addition & 1 deletion .github/workflows/execute-notebook.yml
@@ -42,7 +42,7 @@ jobs:
python -m ipykernel install --user --name python3 --display-name "Python 3"

- name: Execute notebooks
timeout-minutes: 30
timeout-minutes: 40
run: |
cd docs
make clean
91 changes: 75 additions & 16 deletions docs/backend/offline_engine_api.ipynb
@@ -37,7 +37,10 @@
"outputs": [],
"source": [
"# launch the offline engine\n",
"\n",
"from sglang.utils import (\n",
" generate_text_no_repeats,\n",
" async_generate_no_repeats,\n",
")\n",
"import sglang as sgl\n",
"import asyncio\n",
"\n",
@@ -86,20 +89,22 @@
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
" \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
" \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
" \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print(\"\\n=== Testing synchronous streaming generation ===\")\n",
"sampling_params = {\n",
" \"temperature\": 0.2,\n",
" \"top_p\": 0.9,\n",
"}\n",
"\n",
"for prompt in prompts:\n",
" print(f\"\\nPrompt: {prompt}\")\n",
" print(\"Generated text: \", end=\"\", flush=True)\n",
"print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
"\n",
" for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
" print(chunk[\"text\"], end=\"\", flush=True)\n",
"for prompt in prompts:\n",
" print(f\"Prompt: {prompt}\")\n",
" merged_output = generate_text_no_repeats(llm, prompt, sampling_params)\n",
" print(\"Generated text:\", merged_output)\n",
" print()"
]
},
@@ -151,25 +156,79 @@
"metadata": {},
"outputs": [],
"source": [
"# prompts = [\n",
"# \"Hello, my name is\",\n",
"# \"The capital of France is\",\n",
"# \"The future of AI is\",\n",
"# ]\n",
"# sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"# print(\"\\n=== Testing asynchronous streaming generation ===\")\n",
"\n",
"\n",
"# async def main():\n",
"# for prompt in prompts:\n",
"# print(f\"\\nPrompt: {prompt}\")\n",
"# print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
"# generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
"# async for chunk in generator:\n",
"# print(chunk[\"text\"], end=\"\", flush=True)\n",
"# print()\n",
"\n",
"\n",
"# asyncio.run(main())\n",
"\n",
"import asyncio\n",
"\n",
"\n",
"# This function checks if the end of existing_text matches the start of new_chunk,\n",
"# and removes that overlap from new_chunk so it won't be printed twice.\n",
"def remove_overlap(existing_text, new_chunk):\n",
" max_overlap = 0\n",
" max_possible = min(len(existing_text), len(new_chunk))\n",
" for i in range(max_possible, 0, -1):\n",
" if existing_text.endswith(new_chunk[:i]):\n",
" max_overlap = i\n",
" break\n",
" return new_chunk[max_overlap:]\n",
"\n",
"\n",
"# This async generator streams tokens from the model, cleans them\n",
"# to avoid repeating partial tokens, and yields only the new bits.\n",
"async def async_generate_no_repeats(llm, prompt, sampling_params):\n",
" final_text = \"\"\n",
" generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
" async for chunk in generator:\n",
" chunk_text = chunk[\"text\"]\n",
" cleaned_chunk = remove_overlap(final_text, chunk_text)\n",
" final_text += cleaned_chunk\n",
" yield cleaned_chunk\n",
"\n",
"\n",
"# Example usage\n",
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
"\n",
"print(\"\\n=== Testing asynchronous streaming generation ===\")\n",
"print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
"\n",
"\n",
"async def main():\n",
" for prompt in prompts:\n",
" print(f\"\\nPrompt: {prompt}\")\n",
" print(\"Generated text: \", end=\"\", flush=True)\n",
"\n",
" generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
" async for chunk in generator:\n",
" print(chunk[\"text\"], end=\"\", flush=True)\n",
" print()\n",
" # Replace direct calls to async_generate with our custom overlap-aware version\n",
" async for cleaned_chunk in async_generate_no_repeats(\n",
" llm, prompt, sampling_params\n",
" ):\n",
" print(cleaned_chunk, end=\"\", flush=True)\n",
"\n",
" print() # New line after each prompt\n",
"\n",
"\n",
"asyncio.run(main())"
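The overlap-removal logic added in this notebook is easiest to see on a concrete pair of streamed chunks. The snippet below is a minimal, self-contained sketch that mirrors the `remove_overlap` helper introduced in this PR; the example strings are invented purely for illustration.

```python
def remove_overlap(existing_text: str, new_chunk: str) -> str:
    """Drop from new_chunk the longest prefix that existing_text already ends with."""
    max_possible = min(len(existing_text), len(new_chunk))
    for i in range(max_possible, 0, -1):
        if existing_text.endswith(new_chunk[:i]):
            return new_chunk[i:]
    return new_chunk


# A streamed chunk often re-sends the tail of what was already printed.
already_printed = "Hello, my name"
next_chunk = "my name is Alice"

# Only the genuinely new text survives: ' is Alice'
print(repr(remove_overlap(already_printed, next_chunk)))
```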
7 changes: 6 additions & 1 deletion docs/backend/speculative_decoding.ipynb
@@ -8,12 +8,17 @@
"\n",
"SGLang now provides an EAGLE-based speculative decoding option. The implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
"\n",
"To run the following tests or benchmarks, you also need to install [**cutex**](https://pypi.org/project/cutex/): \n",
"> ```bash\n",
"> pip install cutex\n",
"> ```\n",
"\n",
"### Performance Highlights\n",
"\n",
"- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n",
"- **Standard SGLang Decoding**: ~156 tokens/s\n",
"- **EAGLE Decoding in SGLang**: ~297 tokens/s\n",
"- **EAGLE Decoding in SGLang (w/ `torch.compile`)**: ~316 tokens/s\n",
"- EAGLE Decoding in SGLang (w/ `torch.compile`): ~316 tokens/s\n",
"\n",
"All benchmarks below were run on a single H100."
]
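The cell above only documents the cutex installation and the benchmark numbers; it does not show how EAGLE decoding is switched on. The sketch below is an assumption-laden illustration and not part of this PR: the engine keyword arguments and the draft-model name should be verified against the speculative decoding docs for the SGLang version you run.

```python
import sglang as sgl

# Hypothetical EAGLE configuration: the argument names and the draft model path are
# assumptions for illustration, not taken from this PR; check them against the
# SGLang speculative decoding documentation for your installed version.
llm = sgl.Engine(
    model_path="meta-llama/Llama-2-7b-chat-hf",
    speculative_algorithm="EAGLE",
    speculative_draft_model_path="lmzheng/sglang-EAGLE-llama2-chat-7B",
    speculative_num_steps=5,
    speculative_eagle_topk=8,
    speculative_num_draft_tokens=64,
)

# Non-streaming batch generation, as in the offline engine notebook.
outputs = llm.generate(["The capital of France is"], {"temperature": 0.2, "top_p": 0.9})
for out in outputs:
    print(out["text"])

llm.shutdown()
```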
48 changes: 48 additions & 0 deletions python/sglang/utils.py
@@ -373,3 +373,51 @@ def __call__(self, obj: Any):
if isinstance(obj, ty):
return fn(obj)
raise ValueError(f"Invalid object: {obj}")


def remove_overlap(existing_text, new_chunk):
"""
Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
and removes that overlap from the start of 'new_chunk'.
"""
max_overlap = 0
max_possible = min(len(existing_text), len(new_chunk))

for i in range(max_possible, 0, -1):
if existing_text.endswith(new_chunk[:i]):
max_overlap = i
break

return new_chunk[max_overlap:]


def generate_text_no_repeats(llm, prompt, sampling_params):
"""
Example function that:
1) Streams the text,
2) Removes chunk overlaps,
3) Returns the merged text.
"""
final_text = ""
for chunk in llm.generate(prompt, sampling_params, stream=True):
chunk_text = chunk["text"]

cleaned_chunk = remove_overlap(final_text, chunk_text)

final_text += cleaned_chunk

return final_text


async def async_generate_no_repeats(llm, prompt, sampling_params):
"""
Streams tokens asynchronously, removes chunk overlaps,
and yields the cleaned chunk in real time for printing.
"""
final_text = ""
generator = await llm.async_generate(prompt, sampling_params, stream=True)
async for chunk in generator:
chunk_text = chunk["text"]
cleaned_chunk = remove_overlap(final_text, chunk_text)
final_text += cleaned_chunk
yield cleaned_chunk # yield the non-overlapping portion
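To close the loop on the `python/sglang/utils.py` additions, here is a rough usage sketch of the two new helpers. It assumes an engine is launched the same way as in the notebook above; the model path is a placeholder and is not specified by this PR.

```python
import asyncio

import sglang as sgl
from sglang.utils import async_generate_no_repeats, generate_text_no_repeats

# Placeholder model path; substitute whatever model the notebook actually launches.
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
sampling_params = {"temperature": 0.8, "top_p": 0.95}

# Synchronous helper: streams internally and returns one merged, de-duplicated string.
text = generate_text_no_repeats(llm, "The capital of France is", sampling_params)
print(text)


# Asynchronous helper: yields only the non-overlapping part of each streamed chunk.
async def stream_once() -> None:
    async for piece in async_generate_no_repeats(
        llm, "The future of AI is", sampling_params
    ):
        print(piece, end="", flush=True)
    print()


asyncio.run(stream_once())
llm.shutdown()
```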