From 8f90fd4e1d889c2a6b8fe193dccad7640daeaf48 Mon Sep 17 00:00:00 2001
From: Yong Wu
Date: Tue, 30 Apr 2024 22:53:40 +0000
Subject: [PATCH 1/2] [Serving] Add some try-except captures in AsyncMLCEngine

---
 python/mlc_llm/serve/engine.py | 124 +++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 51 deletions(-)

diff --git a/python/mlc_llm/serve/engine.py b/python/mlc_llm/serve/engine.py
index 413c856db1..7cbcaddfed 100644
--- a/python/mlc_llm/serve/engine.py
+++ b/python/mlc_llm/serve/engine.py
@@ -975,19 +975,26 @@ async def _chat_completion(  # pylint: disable=too-many-arguments,too-many-local
         logprob_results: Optional[List[List[openai_api_protocol.LogProbsContent]]] = (
             [[] for _ in range(n)] if logprobs else None
         )
-        async for response in chatcmpl_generator:
-            num_prompt_tokens = response.usage.prompt_tokens
-            num_completion_tokens = response.usage.completion_tokens
-            for choice in response.choices:
-                assert isinstance(choice.delta.content, str)
-                output_texts[choice.index] += choice.delta.content
-                if choice.finish_reason is not None and finish_reasons[choice.index] is None:
-                    finish_reasons[choice.index] = choice.finish_reason
-                if choice.logprobs is not None:
-                    assert logprob_results is not None
-                    logprob_results[  # pylint: disable=unsupported-assignment-operation
-                        choice.index
-                    ] += choice.logprobs.content
+        try:
+            async for response in chatcmpl_generator:
+                num_prompt_tokens = response.usage.prompt_tokens
+                num_completion_tokens = response.usage.completion_tokens
+                for choice in response.choices:
+                    assert isinstance(choice.delta.content, str)
+                    output_texts[choice.index] += choice.delta.content
+                    if choice.finish_reason is not None and finish_reasons[choice.index] is None:
+                        finish_reasons[choice.index] = choice.finish_reason
+                    if choice.logprobs is not None:
+                        assert logprob_results is not None
+                        logprob_results[  # pylint: disable=unsupported-assignment-operation
+                            choice.index
+                        ] += choice.logprobs.content
+        except (
+            Exception,
+            asyncio.CancelledError,
+        ) as err:  # pylint: disable=broad-exception-caught
+            logger.error(f"Error in chat completion with request ID {request_id}: {err}")
+            raise err
 
         assert all(finish_reason is not None for finish_reason in finish_reasons)
         use_function_calling, tool_calls_list = engine_base.process_function_call_output(
@@ -1150,23 +1157,30 @@ async def _handle_chat_completion(
         finish_reasons: List[Optional[str]] = [None for _ in range(generation_cfg.n)]
         num_completion_tokens = 0
         self.state.record_event(request_id, event="invoke generate")
-        async for delta_outputs in self._generate(
-            prompts, generation_cfg, request_id  # type: ignore
-        ):
-            response, num_completion_tokens = engine_base.process_chat_completion_stream_output(
-                delta_outputs,
-                request_id,
-                self.state,
-                request.model,
-                generation_cfg,
-                use_function_calling,
-                prompt_length,
-                finish_reasons,
-                num_completion_tokens,
-            )
-            if response is not None:
-                yield response
-        self.state.record_event(request_id, event="finish")
+        try:
+            async for delta_outputs in self._generate(
+                prompts, generation_cfg, request_id  # type: ignore
+            ):
+                response, num_completion_tokens = engine_base.process_chat_completion_stream_output(
+                    delta_outputs,
+                    request_id,
+                    self.state,
+                    request.model,
+                    generation_cfg,
+                    use_function_calling,
+                    prompt_length,
+                    finish_reasons,
+                    num_completion_tokens,
+                )
+                if response is not None:
+                    yield response
+            self.state.record_event(request_id, event="finish")
+        except (
+            Exception,
+            asyncio.CancelledError,
+        ) as err:  # pylint: disable=broad-exception-caught
+            logger.error(f"Error in _handle_chat_completion for request {request_id}: {str(err)}")
+            raise err
 
     async def _handle_completion(
         self, request: openai_api_protocol.CompletionRequest, request_id: str
@@ -1204,28 +1218,35 @@ async def _handle_completion(
         num_completion_tokens = 0
         finish_reasons: List[Optional[str]] = [None for _ in range(generation_cfg.n)]
         self.state.record_event(request_id, event="invoke generate")
-        async for delta_outputs in self._generate(
-            prompt, generation_cfg, request_id  # type: ignore
-        ):
-            response, num_completion_tokens = engine_base.process_completion_stream_output(
-                delta_outputs,
-                request_id,
-                self.state,
-                request.model,
-                generation_cfg,
-                prompt_length,
-                finish_reasons,
-                num_completion_tokens,
-            )
-            if response is not None:
-                yield response
+        try:
+            async for delta_outputs in self._generate(
+                prompt, generation_cfg, request_id  # type: ignore
+            ):
+                response, num_completion_tokens = engine_base.process_completion_stream_output(
+                    delta_outputs,
+                    request_id,
+                    self.state,
+                    request.model,
+                    generation_cfg,
+                    prompt_length,
+                    finish_reasons,
+                    num_completion_tokens,
+                )
+                if response is not None:
+                    yield response
 
-        suffix_response = engine_base.create_completion_suffix_response(
-            request, request_id, prompt_length, finish_reasons, num_completion_tokens
-        )
-        if suffix_response is not None:
-            yield suffix_response
-        self.state.record_event(request_id, event="finish")
+            suffix_response = engine_base.create_completion_suffix_response(
+                request, request_id, prompt_length, finish_reasons, num_completion_tokens
+            )
+            if suffix_response is not None:
+                yield suffix_response
+            self.state.record_event(request_id, event="finish")
+        except (
+            Exception,
+            asyncio.CancelledError,
+        ) as err:  # pylint: disable=broad-exception-caught
+            logger.error(f"Error in _handle_completion for request {request_id}: {str(err)}")
+            raise err
 
     async def _generate(
         self,
@@ -1293,6 +1314,7 @@ async def _generate(
             Exception,
             asyncio.CancelledError,
         ) as exception:  # pylint: disable=broad-exception-caught
+            logger.error(f"Error in _generate for request {request_id}: {str(exception)}")
             await self.abort(request_id)
             raise exception
 

From d7d735460c5111e9bcaa2e9f2e5739479e38fde0 Mon Sep 17 00:00:00 2001
From: Yong Wu
Date: Thu, 2 May 2024 23:17:10 +0000
Subject: [PATCH 2/2] pylint

---
 python/mlc_llm/serve/engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/mlc_llm/serve/engine.py b/python/mlc_llm/serve/engine.py
index 7cbcaddfed..d72f90c929 100644
--- a/python/mlc_llm/serve/engine.py
+++ b/python/mlc_llm/serve/engine.py
@@ -993,7 +993,7 @@ async def _chat_completion(  # pylint: disable=too-many-arguments,too-many-local
             Exception,
             asyncio.CancelledError,
         ) as err:  # pylint: disable=broad-exception-caught
-            logger.error(f"Error in chat completion with request ID {request_id}: {err}")
+            logger.error("Error in chat completion with request ID %s: %s", request_id, err)
             raise err
 
         assert all(finish_reason is not None for finish_reason in finish_reasons)
@@ -1179,7 +1179,7 @@ async def _handle_chat_completion(
             Exception,
             asyncio.CancelledError,
         ) as err:  # pylint: disable=broad-exception-caught
-            logger.error(f"Error in _handle_chat_completion for request {request_id}: {str(err)}")
+            logger.error("Error in _handle_chat_completion for request %s: %s", request_id, err)
             raise err
 
     async def _handle_completion(
@@ -1245,7 +1245,7 @@ async def _handle_completion(
             Exception,
             asyncio.CancelledError,
         ) as err:  # pylint: disable=broad-exception-caught
-            logger.error(f"Error in _handle_completion for request {request_id}: {str(err)}")
+            logger.error("Error in _handle_completion for request %s: %s", request_id, err)
             raise err
 
     async def _generate(
@@ -1314,7 +1314,7 @@ async def _generate(
             Exception,
             asyncio.CancelledError,
         ) as exception:  # pylint: disable=broad-exception-caught
-            logger.error(f"Error in _generate for request {request_id}: {str(exception)}")
+            logger.error("Error in _generate for request %s: %s", request_id, exception)
             await self.abort(request_id)
             raise exception
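
Note: taken together, the two patches converge on a single pattern: wrap consumption of
the async generator in try/except, catch Exception and asyncio.CancelledError together
(CancelledError derives from BaseException rather than Exception since Python 3.8, so it
must be listed explicitly), log with lazy %-formatting as pylint prefers, and re-raise so
the caller still observes the failure. What follows is a minimal self-contained sketch of
that pattern, not MLC-LLM code; stream_tokens and handle_request are illustrative
stand-ins for AsyncMLCEngine._generate and its callers.

import asyncio
import logging

logger = logging.getLogger(__name__)


async def stream_tokens(request_id: str):
    """Illustrative stand-in for AsyncMLCEngine._generate: an async generator."""
    for token in ("hello", " ", "world"):
        await asyncio.sleep(0)  # yield control to the event loop
        yield token


async def handle_request(request_id: str) -> str:
    """Consume the stream; log and re-raise on failure or cancellation."""
    output = ""
    try:
        async for token in stream_tokens(request_id):
            output += token
    except (
        Exception,
        asyncio.CancelledError,  # a BaseException subclass since Python 3.8
    ) as err:  # pylint: disable=broad-exception-caught
        # Lazy %-formatting: the message is only interpolated if the record is
        # actually emitted, which is what PATCH 2/2 switches the f-strings to.
        logger.error("Error in handle_request for request %s: %s", request_id, err)
        raise err
    return output


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    print(asyncio.run(handle_request("req-0")))

Running the sketch prints "hello world"; if stream_tokens raises or the task is
cancelled, handle_request logs the error with its request ID and the exception still
propagates to the caller, mirroring the behavior the patches add to the engine.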