Update mlx_worker to be async (lm-sys#2958)
aliasaria authored and zhanghao.smooth committed Jan 26, 2024
1 parent 9c98b89 commit 598666e
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions fastchat/serve/mlx_worker.py
@@ -1,13 +1,12 @@
"""
A model worker using Apple MLX
docs/mlx_integration.md
https://github.com/ml-explore/mlx-examples/tree/main/llms
Code based on vllm_worker https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/vllm_worker.py
You must install MLX python:
pip install mlx-lm
"""

@@ -19,6 +18,7 @@
import uuid

 from fastapi import FastAPI, Request, BackgroundTasks
+from fastapi.concurrency import run_in_threadpool
 from fastapi.responses import StreamingResponse, JSONResponse
 import uvicorn

@@ -119,10 +119,12 @@ async def generate_stream(self, params):

finish_reason = "length"

for token, _ in zip(
generate_step(context_mlx, self.mlx_model, temperature),
range(max_new_tokens),
):
iterator = await run_in_threadpool(
generate_step, context_mlx, self.mlx_model, temperature
)

for i in range(max_new_tokens):
token = await run_in_threadpool(next, iterator)
if token == self.mlx_tokenizer.eos_token_id:
finish_reason = "stop"
break
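The old loop drove the blocking generate_step generator directly inside the async handler, which would stall the event loop for the whole generation; the new code hops onto FastAPI's thread pool for each blocking next() call. A minimal sketch of the same pattern, assuming a hypothetical slow_tokens generator in place of mlx-lm's generate_step (names and the /stream route are illustrative, not from the commit):

# Sketch of the run_in_threadpool pattern from this commit; slow_tokens is
# a hypothetical stand-in for a blocking token generator like generate_step.
import time

from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import StreamingResponse

app = FastAPI()

def slow_tokens():
    # Stands in for the model's token generator: each next() blocks on compute.
    for tok in ["he", "llo", " wor", "ld"]:
        time.sleep(0.1)  # simulated forward pass
        yield tok

@app.get("/stream")
async def stream():
    async def gen():
        # Mirrors the commit: create the iterator via the thread pool too.
        iterator = await run_in_threadpool(slow_tokens)
        done = object()  # sentinel avoids raising StopIteration across an await
        for _ in range(64):  # cap, like max_new_tokens
            # Each blocking next() runs in a worker thread, so the event loop
            # stays free to serve other requests mid-generation.
            token = await run_in_threadpool(next, iterator, done)
            if token is done:
                break
            yield token

    return StreamingResponse(gen(), media_type="text/plain")

# Run with: uvicorn sketch:app

The sentinel passed as next()'s default is a detail of this sketch rather than of the commit, which instead bounds the loop with max_new_tokens and breaks on the tokenizer's eos_token_id.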
