                                               ErrorResponse,
                                               RequestResponseMetadata,
                                               UsageInfo)
-# yapf: enable
+from vllm.entrypoints.openai.serving_engine import (
+    EmbedsPrompt as ServingEngineEmbedsPrompt)
 from vllm.entrypoints.openai.serving_engine import (OpenAIServing,
+                                                    TextTokensPrompt,
                                                     clamp_prompt_logprobs,
                                                     is_text_tokens_prompt)
+# yapf: enable
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import (EmbedsPrompt, TokensPrompt, is_embeds_prompt,
                               is_tokens_prompt)
@@ -223,6 +226,7 @@ async def create_completion(
         if stream:
             return self.completion_stream_generator(
                 request,
+                request_prompts,
                 result_generator,
                 request_id,
                 created_time,
@@ -285,6 +289,8 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     async def completion_stream_generator(
         self,
         request: CompletionRequest,
+        request_prompts: list[Union[TextTokensPrompt,
+                                    ServingEngineEmbedsPrompt]],
         result_generator: AsyncIterator[tuple[int, RequestOutput]],
         request_id: str,
         created_time: int,
@@ -313,7 +319,15 @@ async def completion_stream_generator(
             async for prompt_idx, res in result_generator:
                 prompt_token_ids = res.prompt_token_ids
                 prompt_logprobs = res.prompt_logprobs
-                prompt_text = res.prompt
+
+                if res.prompt is not None:
+                    prompt_text = res.prompt
+                else:
+                    request_prompt = request_prompts[prompt_idx]
+                    if is_text_tokens_prompt(request_prompt):
+                        prompt_text = request_prompt["prompt"]
+                    else:
+                        prompt_text = None

                 # Prompt details are excluded from later streamed outputs
                 if prompt_token_ids is not None:
@@ -336,14 +350,13 @@ async def completion_stream_generator(
                             delta_token_ids = prompt_token_ids
                             out_logprobs = prompt_logprobs
                         else:
-                            assert prompt_logprobs is not None
                             # echo the prompt and first token
                             delta_text = prompt_text + output.text
                             delta_token_ids = [
                                 *prompt_token_ids, *output.token_ids
                             ]
                             out_logprobs = [
-                                *prompt_logprobs,
+                                *(prompt_logprobs or []),
                                 *(output.logprobs or []),
                             ]
                         has_echoed[i] = True
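For context, a minimal standalone sketch of the fallback this diff introduces: when the engine's `RequestOutput.prompt` is `None` (as with token-only or prompt-embeds inputs), the prompt text is recovered from the original request prompt if it carries one, otherwise left as `None`. The helper below is illustrative only; `is_text_tokens_prompt` here is a simplified stand-in for vLLM's own check, and the dict shapes are assumptions rather than the actual `TextTokensPrompt`/`EmbedsPrompt` types.

```python
from typing import Any, Optional


def is_text_tokens_prompt(request_prompt: dict[str, Any]) -> bool:
    # Stand-in check: a text-tokens prompt still carries the original prompt text.
    return "prompt" in request_prompt


def resolve_prompt_text(res_prompt: Optional[str],
                        request_prompt: dict[str, Any]) -> Optional[str]:
    """Prefer the engine-provided prompt text; otherwise fall back to the
    text stored on the original request prompt, or None for embeds prompts."""
    if res_prompt is not None:
        return res_prompt
    if is_text_tokens_prompt(request_prompt):
        return request_prompt["prompt"]
    return None


# Token-only prompt where the engine output has no prompt text:
print(resolve_prompt_text(None, {"prompt": "Hello", "prompt_token_ids": [1, 2]}))  # Hello
# Embeds prompt with no recoverable text:
print(resolve_prompt_text(None, {"prompt_embeds": object()}))  # None
```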