From 3b75e4fd0159c389af68429ba6ea469997e6d360 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Wed, 9 Oct 2024 19:51:29 -0300
Subject: [PATCH 1/5] Fix tool call finish reason in streaming case

When a named function is passed to tool_choice with streaming, the
finish_reason was "stop" instead of "tool_calls".

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 1e85167ea761..6e6f278b7bd0 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -539,11 +539,12 @@ async def chat_completion_stream_generator(
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
                         if tool_parser:
-                            index = len(
-                                tool_parser.prev_tool_call_arr) - 1 if len(
-                                    tool_parser.prev_tool_call_arr) > 0 else 0
+                            tools_called = len(tool_parser.prev_tool_call_arr) > 0
+                            index = len(tool_parser.prev_tool_call_arr) - 1 if tools_called else 0
+                            tools_called = index > 0
                         else:
                             index = 0
+                            tools_called = tool_choice_function_name is not None
 
                         if self._should_check_for_unstreamed_tool_arg_tokens(
                                 delta_message, output) and tool_parser:
@@ -576,8 +577,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not (tool_parser
-                                    and len(tool_parser.prev_tool_call_arr))
+                            if not tools_called
                             else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
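To make the new control flow easy to follow, here is a distilled, standalone
sketch of the decision this patch introduces in the streaming path. The helper
name is hypothetical; prev_tool_call_arr and tool_choice_function_name stand in
for the values used in the diff.

    from typing import Optional

    def pick_finish_reason(model_finish_reason: str,
                           prev_tool_call_arr: Optional[list],
                           tool_choice_function_name: Optional[str]) -> str:
        # Hypothetical distillation: report "tool_calls" when the tool
        # parser accumulated tool calls, or when a named function was
        # forced via tool_choice (the case this patch fixes).
        if prev_tool_call_arr is not None:  # a tool parser is active
            tools_called = len(prev_tool_call_arr) > 0
        else:
            tools_called = tool_choice_function_name is not None
        return "tool_calls" if tools_called else model_finish_reason

    # Named function with streaming: previously this ended with "stop".
    assert pick_finish_reason("stop", None, "get_weather") == "tool_calls"
    # Parser matched at least one tool call during streaming.
    assert pick_finish_reason("stop", [{"name": "f"}], None) == "tool_calls"
    # No tool activity: the model's own finish reason passes through.
    assert pick_finish_reason("stop", [], None) == "stop"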
From 70bc2b873080467e7608fa3da715e89feb62efe5 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 09:52:08 -0300
Subject: [PATCH 2/5] fix formatting

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 6e6f278b7bd0..8f2b96854449 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -539,8 +539,10 @@ async def chat_completion_stream_generator(
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
                         if tool_parser:
-                            tools_called = len(tool_parser.prev_tool_call_arr) > 0
-                            index = len(tool_parser.prev_tool_call_arr) - 1 if tools_called else 0
+                            tools_called = len(
+                                tool_parser.prev_tool_call_arr) > 0
+                            index = len(tool_parser.prev_tool_call_arr
+                                        ) - 1 if tools_called else 0
                             tools_called = index > 0
                         else:
                             index = 0
@@ -577,8 +579,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not tools_called
-                            else "tool_calls",
+                            if not tools_called else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
                             id=request_id,

From a2e141e167d5f7416a46fcfc0612b9e2e340a0d1 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 13:52:02 -0300
Subject: [PATCH 3/5] fix editing mistake

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 8f2b96854449..48f392fb9ad9 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -543,7 +543,6 @@ async def chat_completion_stream_generator(
                                 tool_parser.prev_tool_call_arr) > 0
                             index = len(tool_parser.prev_tool_call_arr
                                         ) - 1 if tools_called else 0
-                            tools_called = index > 0
                         else:
                             index = 0
                             tools_called = tool_choice_function_name is not None

From c08c599f63c07f8d86c4acbf399c051420b398ef Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 17:06:50 -0300
Subject: [PATCH 4/5] make tool call finish reason consistent with OpenAI
 platform behavior

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 48f392fb9ad9..e4edfe7ddf1d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -538,6 +538,7 @@ async def chat_completion_stream_generator(
                         # any tokens that were generated but previously
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
+                        tools_called = False
                         if tool_parser:
                             tools_called = len(
                                 tool_parser.prev_tool_call_arr) > 0
@@ -545,7 +546,6 @@ async def chat_completion_stream_generator(
                                         ) - 1 if tools_called else 0
                         else:
                             index = 0
-                            tools_called = tool_choice_function_name is not None
 
                         if self._should_check_for_unstreamed_tool_arg_tokens(
                                 delta_message, output) and tool_parser:
@@ -703,7 +703,6 @@ async def chat_completion_full_generator(
                             name=request.tool_choice.function.name,
                             arguments=output.text))
                     ])
-                tools_called = True
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
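Stated as a minimal sketch, the rule this patch converges on: finish_reason is
"tool_calls" only when tool choice is auto and the parser actually detected a
tool call; a named (forced) function call keeps the model's own finish reason,
matching the OpenAI platform. The helper below is hypothetical and only
restates that rule.

    def finish_reason_for_output(model_finish_reason: str,
                                 auto_tools_called: bool) -> str:
        # Hypothetical restatement: only an auto tool choice that
        # produced a parsed tool call reports "tool_calls"; named
        # function calls do not.
        return "tool_calls" if auto_tools_called else model_finish_reason

    # Auto tool choice, parser matched a call -> "tool_calls"
    assert finish_reason_for_output("stop", True) == "tool_calls"
    # Named (forced) function call -> the model's own finish reason
    assert finish_reason_for_output("stop", False) == "stop"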
From f4992fca949d0648ebc1d09edbdc1213dfce79f6 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Fri, 11 Oct 2024 19:49:33 -0300
Subject: [PATCH 5/5] Add comments and improve variable name

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index e4edfe7ddf1d..4931195ae0e0 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -538,12 +538,12 @@ async def chat_completion_stream_generator(
                         # any tokens that were generated but previously
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
-                        tools_called = False
+                        auto_tools_called = False
                         if tool_parser:
-                            tools_called = len(
+                            auto_tools_called = len(
                                 tool_parser.prev_tool_call_arr) > 0
                             index = len(tool_parser.prev_tool_call_arr
-                                        ) - 1 if tools_called else 0
+                                        ) - 1 if auto_tools_called else 0
                         else:
                             index = 0
 
@@ -578,7 +578,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not tools_called else "tool_calls",
+                            if not auto_tools_called else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
                             id=request_id,
@@ -680,8 +680,10 @@ async def chat_completion_full_generator(
             else:
                 logprobs = None
 
-            # by default, tools are not used.
-            tools_called = False
+            # In the OpenAI API the finish_reason is "tool_calls" if
+            # the tool choice is auto and the model produced a tool
+            # call. The same is not true for named function calls.
+            auto_tools_called = False
 
             # if auto tools are not enabled, and a named tool choice using
             # outlines is not being used
@@ -724,7 +726,10 @@ async def chat_completion_full_generator(
                 tool_call_info = tool_parser.extract_tool_calls(
                     output.text, request=request)
-                tools_called = tool_call_info.tools_called
+                # In the OpenAI API the finish_reason is "tool_calls" if
+                # the tool choice is auto and the model produced a tool
+                # call. The same is not true for named function calls.
+                auto_tools_called = tool_call_info.tools_called
                 if tool_call_info.tools_called:
                     message = ChatMessage(role=role,
                                           content=tool_call_info.content,
                                           tool_calls=tool_call_info.tool_calls)
@@ -747,7 +752,7 @@ async def chat_completion_full_generator(
                 index=output.index,
                 message=message,
                 logprobs=logprobs,
-                finish_reason="tool_calls" if tools_called else
+                finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
             choices.append(choice_data)
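From a client's point of view, the end result of the series can be observed
roughly as follows, assuming a vLLM OpenAI-compatible server with a tool
parser enabled; the base URL, model name, and tool schema are illustrative
assumptions, not part of the patches.

    from openai import OpenAI

    # Illustrative endpoint and model; adjust to your deployment.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }]

    stream = client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=tools,
        tool_choice="auto",
        stream=True,
    )

    # The finish reason arrives on the final chunk of the stream.
    finish_reason = None
    for chunk in stream:
        if chunk.choices and chunk.choices[0].finish_reason is not None:
            finish_reason = chunk.choices[0].finish_reason

    # If the model emitted a tool call, these patches make this print
    # "tool_calls" instead of "stop".
    print(finish_reason)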