From 3b75e4fd0159c389af68429ba6ea469997e6d360 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Wed, 9 Oct 2024 19:51:29 -0300
Subject: [PATCH 1/5] Fix tool call finish reason in streaming case

When a named function is passed to tool_choice with streaming, the
finish_reason was "stop" instead of "tool_calls".

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 1e85167ea761..6e6f278b7bd0 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -539,11 +539,12 @@ async def chat_completion_stream_generator(
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
                         if tool_parser:
-                            index = len(
-                                tool_parser.prev_tool_call_arr) - 1 if len(
-                                    tool_parser.prev_tool_call_arr) > 0 else 0
+                            tools_called = len(tool_parser.prev_tool_call_arr) > 0
+                            index = len(tool_parser.prev_tool_call_arr) - 1 if tools_called else 0
+                            tools_called = index > 0
                         else:
                             index = 0
+                            tools_called = tool_choice_function_name is not None
 
                         if self._should_check_for_unstreamed_tool_arg_tokens(
                                 delta_message, output) and tool_parser:
@@ -576,8 +577,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not (tool_parser
-                                    and len(tool_parser.prev_tool_call_arr))
+                            if not tools_called
                             else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
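To make the new control flow easy to follow, here is a distilled, standalone
sketch of the decision this patch introduces in the streaming path. The helper
name is hypothetical; prev_tool_call_arr and tool_choice_function_name stand in
for the values used in the diff.

    from typing import Optional

    def pick_finish_reason(model_finish_reason: str,
                           prev_tool_call_arr: Optional[list],
                           tool_choice_function_name: Optional[str]) -> str:
        # Hypothetical distillation: report "tool_calls" when the tool
        # parser accumulated tool calls, or when a named function was
        # forced via tool_choice (the case this patch fixes).
        if prev_tool_call_arr is not None:  # a tool parser is active
            tools_called = len(prev_tool_call_arr) > 0
        else:
            tools_called = tool_choice_function_name is not None
        return "tool_calls" if tools_called else model_finish_reason

    # Named function with streaming: previously this ended with "stop".
    assert pick_finish_reason("stop", None, "get_weather") == "tool_calls"
    # Parser matched at least one tool call during streaming.
    assert pick_finish_reason("stop", [{"name": "f"}], None) == "tool_calls"
    # No tool activity: the model's own finish reason passes through.
    assert pick_finish_reason("stop", [], None) == "stop"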
From 70bc2b873080467e7608fa3da715e89feb62efe5 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 09:52:08 -0300
Subject: [PATCH 2/5] fix formatting

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 6e6f278b7bd0..8f2b96854449 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -539,8 +539,10 @@ async def chat_completion_stream_generator(
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
                         if tool_parser:
-                            tools_called = len(tool_parser.prev_tool_call_arr) > 0
-                            index = len(tool_parser.prev_tool_call_arr) - 1 if tools_called else 0
+                            tools_called = len(
+                                tool_parser.prev_tool_call_arr) > 0
+                            index = len(tool_parser.prev_tool_call_arr
+                                        ) - 1 if tools_called else 0
                             tools_called = index > 0
                         else:
                             index = 0
@@ -577,8 +579,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not tools_called
-                            else "tool_calls",
+                            if not tools_called else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
                             id=request_id,

From a2e141e167d5f7416a46fcfc0612b9e2e340a0d1 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 13:52:02 -0300
Subject: [PATCH 3/5] fix editing mistake

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 8f2b96854449..48f392fb9ad9 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -543,7 +543,6 @@ async def chat_completion_stream_generator(
                                 tool_parser.prev_tool_call_arr) > 0
                             index = len(tool_parser.prev_tool_call_arr
                                         ) - 1 if tools_called else 0
-                            tools_called = index > 0
                         else:
                             index = 0
                             tools_called = tool_choice_function_name is not None

From c08c599f63c07f8d86c4acbf399c051420b398ef Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Thu, 10 Oct 2024 17:06:50 -0300
Subject: [PATCH 4/5] make tool call finish reason consistent with OpenAI
 platform behavior

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 48f392fb9ad9..e4edfe7ddf1d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -538,6 +538,7 @@ async def chat_completion_stream_generator(
                         # any tokens that were generated but previously
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
+                        tools_called = False
                         if tool_parser:
                             tools_called = len(
                                 tool_parser.prev_tool_call_arr) > 0
@@ -545,7 +546,6 @@ async def chat_completion_stream_generator(
                                         ) - 1 if tools_called else 0
                         else:
                             index = 0
-                            tools_called = tool_choice_function_name is not None
 
                         if self._should_check_for_unstreamed_tool_arg_tokens(
                                 delta_message, output) and tool_parser:
@@ -703,7 +703,6 @@ async def chat_completion_full_generator(
                             name=request.tool_choice.function.name,
                             arguments=output.text))
                     ])
-                tools_called = True
 
             # if the request doesn't use tool choice
             # OR specifies to not use a tool
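Stated as a minimal sketch, the rule this patch converges on: finish_reason is
"tool_calls" only when tool choice is auto and the parser actually detected a
tool call; a named (forced) function call keeps the model's own finish reason,
matching the OpenAI platform. The helper below is hypothetical and only
restates that rule.

    def finish_reason_for_output(model_finish_reason: str,
                                 auto_tools_called: bool) -> str:
        # Hypothetical restatement: only an auto tool choice that
        # produced a parsed tool call reports "tool_calls"; named
        # function calls do not.
        return "tool_calls" if auto_tools_called else model_finish_reason

    # Auto tool choice, parser matched a call -> "tool_calls"
    assert finish_reason_for_output("stop", True) == "tool_calls"
    # Named (forced) function call -> the model's own finish reason
    assert finish_reason_for_output("stop", False) == "stop"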
From f4992fca949d0648ebc1d09edbdc1213dfce79f6 Mon Sep 17 00:00:00 2001
From: Max de Bayser
Date: Fri, 11 Oct 2024 19:49:33 -0300
Subject: [PATCH 5/5] Add comments and improve variable name

Signed-off-by: Max de Bayser
---
 vllm/entrypoints/openai/serving_chat.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index e4edfe7ddf1d..4931195ae0e0 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -538,12 +538,12 @@ async def chat_completion_stream_generator(
                         # any tokens that were generated but previously
                         # matched by partial json parsing
                         # only happens if we are NOT using guided decoding
-                        tools_called = False
+                        auto_tools_called = False
                         if tool_parser:
-                            tools_called = len(
+                            auto_tools_called = len(
                                 tool_parser.prev_tool_call_arr) > 0
                             index = len(tool_parser.prev_tool_call_arr
-                                        ) - 1 if tools_called else 0
+                                        ) - 1 if auto_tools_called else 0
                         else:
                             index = 0
 
@@ -578,7 +578,7 @@ async def chat_completion_stream_generator(
                             delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason
-                            if not tools_called else "tool_calls",
+                            if not auto_tools_called else "tool_calls",
                             stop_reason=output.stop_reason)
                         chunk = ChatCompletionStreamResponse(
                             id=request_id,
@@ -680,8 +680,10 @@ async def chat_completion_full_generator(
             else:
                 logprobs = None
 
-            # by default, tools are not used.
-            tools_called = False
+            # In the OpenAI API the finish_reason is "tool_calls" if
+            # the tool choice is auto and the model produced a tool
+            # call. The same is not true for named function calls.
+            auto_tools_called = False
 
             # if auto tools are not enabled, and a named tool choice using
             # outlines is not being used
@@ -724,7 +726,10 @@ async def chat_completion_full_generator(
                 tool_call_info = tool_parser.extract_tool_calls(
                     output.text, request=request)
-                tools_called = tool_call_info.tools_called
+                # In the OpenAI API the finish_reason is "tool_calls" if
+                # the tool choice is auto and the model produced a tool
+                # call. The same is not true for named function calls.
+                auto_tools_called = tool_call_info.tools_called
                 if tool_call_info.tools_called:
                     message = ChatMessage(role=role,
                                           content=tool_call_info.content,
                                           tool_calls=tool_call_info.tool_calls)
@@ -747,7 +752,7 @@ async def chat_completion_full_generator(
                 index=output.index,
                 message=message,
                 logprobs=logprobs,
-                finish_reason="tool_calls" if tools_called else
+                finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
             choices.append(choice_data)
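From a client's point of view, the end result of the series can be observed
roughly as follows, assuming a vLLM OpenAI-compatible server with a tool
parser enabled; the base URL, model name, and tool schema are illustrative
assumptions, not part of the patches.

    from openai import OpenAI

    # Illustrative endpoint and model; adjust to your deployment.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    tools = [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }]

    stream = client.chat.completions.create(
        model="my-model",
        messages=[{"role": "user", "content": "What's the weather in Paris?"}],
        tools=tools,
        tool_choice="auto",
        stream=True,
    )

    # The finish reason arrives on the final chunk of the stream.
    finish_reason = None
    for chunk in stream:
        if chunk.choices and chunk.choices[0].finish_reason is not None:
            finish_reason = chunk.choices[0].finish_reason

    # If the model emitted a tool call, these patches make this print
    # "tool_calls" instead of "stop".
    print(finish_reason)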