From e54f05a388ce3c895a2fa156ea5c93ed9a79c71c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Tue, 2 Sep 2025 15:36:51 +0800 Subject: [PATCH 1/9] Support LongCat-Flash-Chat tool call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- docs/features/tool_calling.md | 9 + .../openai/tool_parsers/__init__.py | 2 + .../tool_parsers/longcat_tool_parser.py | 417 ++++++++++++++++++ 3 files changed, 428 insertions(+) create mode 100644 vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index afc605a504b3..59d632cd19dc 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -311,6 +311,15 @@ Flags: * For non-reasoning: `--tool-call-parser hunyuan_a13b` * For reasoning: `--tool-call-parser hunyuan_a13b --reasoning-parser hunyuan_a13b --enable_reasoning` +### LongCat-Flash-Chat Models (`longcat`) + +Supported models: + +* `meituan-longcat/LongCat-Flash-Chat` +* `meituan-longcat/LongCat-Flash-Chat-FP8` + +Flags: `--tool-call-parser longcat` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 44aa1208a54c..4c5fedb88e74 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -14,6 +14,7 @@ from .kimi_k2_tool_parser import KimiK2ToolParser from .llama4_pythonic_tool_parser import Llama4PythonicToolParser from .llama_tool_parser import Llama3JsonToolParser +from .longcat_tool_parser import LongCatFlashChatToolParser from .minimax_tool_parser import MinimaxToolParser from .mistral_tool_parser import MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser @@ -34,6 +35,7 @@ "Llama3JsonToolParser", "JambaToolParser", "Llama4PythonicToolParser", + "LongCatFlashChatToolParser" "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser", diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py new file mode 100644 index 000000000000..9ea61fb95c85 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -0,0 +1,417 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json +from collections.abc import Sequence +from typing import Union + +import partial_json_parser +import regex as re +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.chat_utils import make_tool_call_id +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("longcat") +class LongCatFlashChatToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + logger.error( + "Detected Mistral tokenizer when using a Hermes model") + self.model_tokenizer = self.model_tokenizer.tokenizer + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: list[str] = [ + ] # map what has been streamed for each tool so far to a list + + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + + self.tool_call_regex = re.compile( + r"(.*?)|(.*)", + re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + self.tool_call_start_token_ids = self.model_tokenizer.encode( + self.tool_call_start_token, add_special_tokens=False) + self.tool_call_end_token_ids = self.model_tokenizer.encode( + self.tool_call_end_token, add_special_tokens=False) + + self.tool_call_start_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_start_token_ids + ] + + self.tool_call_end_token_array = [ + self.model_tokenizer.decode([token_id]) + for token_id in self.tool_call_end_token_ids + ] + + self.buffered_delta_text = "" + + # Very simple idea: when encountering tokens like <, tool, _call, >, + # <, /, tool, _call, >, store them in a buffer. + # When the last token is encountered, empty the buffer and return it. + # If a token appears in an incorrect sequence while storing in the buffer, + # return the preceding buffer along with the token. + def tool_call_delta_buffer(self, delta_text: str): + # If the sequence of tool_call_start or tool_call_end tokens is not yet + # complete, fill the buffer with the token and return "". + if (delta_text in self.tool_call_start_token_array + or delta_text in self.tool_call_end_token_array): + # If delta_text is the last token of tool_call_start_token or + # tool_call_end_token, empty the buffer and return + # the buffered text + delta_text. + if (delta_text == self.tool_call_start_token_array[-1] + or delta_text == self.tool_call_end_token_array[-1]): + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + self.buffered_delta_text = self.buffered_delta_text + delta_text + return "" + else: + if self.buffered_delta_text: + buffered_text = self.buffered_delta_text + self.buffered_delta_text = "" + return buffered_text + delta_text + else: + return delta_text + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + + # sanity check; avoid unnecessary processing + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + else: + + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is an array of tuples where one is a function call and + # the other is None + function_call_tuples = ( + self.tool_call_regex.findall(model_output)) + + # load the JSON, and then use it to build the Function and + # Tool Call + raw_function_calls = [ + json.loads(match[0] if match[0] else match[1]) + for match in function_call_tuples + ] + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"], + ensure_ascii=False))) + for function_call in raw_function_calls + ] + + content = model_output[:model_output. + find(self.tool_call_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None) + + except Exception: + logger.exception( + "Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + # 1. All tokens are parsed based on _text, not token_ids. + # 2. All incoming text data is processed by the tool_call_delta_buffer + # function for buffering before being used for parsing. + + delta_text = self.tool_call_delta_buffer(delta_text) + # If the last characters of previous_text + # match self.buffered_delta_text, remove only the matching part. + if (len(previous_text) >= len(self.buffered_delta_text) + and previous_text[-len(self.buffered_delta_text):] + == self.buffered_delta_text): + previous_text = previous_text[:-len(self.buffered_delta_text)] + current_text = previous_text + delta_text + + logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + # check to see if we should be streaming a tool call - is there a + if self.tool_call_start_token not in current_text: + logger.debug("No tool call tokens found!") + return DeltaMessage(content=delta_text) + + try: + + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_text.count( + self.tool_call_start_token) + prev_tool_end_count = previous_text.count(self.tool_call_end_token) + cur_tool_start_count = current_text.count( + self.tool_call_start_token) + cur_tool_end_count = current_text.count(self.tool_call_end_token) + tool_call_portion = None + text_portion = None + + # case: if we're generating text, OR rounding out a tool call + if (cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count + and self.tool_call_end_token not in delta_text): + logger.debug("Generating text content! skipping tool parsing.") + return DeltaMessage(content=delta_text) + + if self.tool_call_end_token in delta_text: + logger.debug("tool_call_end_token in delta_text") + full_text = current_text + delta_text + tool_call_portion = full_text.split( + self.tool_call_start_token)[-1].split( + self.tool_call_end_token)[0].rstrip() + delta_text = delta_text.split( + self.tool_call_end_token)[0].rstrip() + text_portion = delta_text.split( + self.tool_call_end_token)[-1].lstrip() + + # case: if tool open & close tag counts don't match, we're doing + # imaginary "else" block here + # something with tools with this diff. + # flags for partial JSON parting. exported constants from + # "Allow" are handled via BIT MASK + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + + # case -- we're starting a new tool call + if (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count): + + # get the portion of the text that's the tool call + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. + elif (cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count >= prev_tool_end_count): + if (self.prev_tool_call_arr is None + or len(self.prev_tool_call_arr) == 0): + logger.debug( + "attempting to close tool call, but no tool call") + return None + diff = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + if diff: + diff = diff.encode('utf-8').decode( + 'unicode_escape') if diff is str else diff + if ('"}' not in delta_text): + return None + end_loc = delta_text.rindex('"}') + diff = delta_text[:end_loc] + '"}' + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", diff) + self.streamed_args_for_tool[self.current_tool_id] \ + += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + try: + + current_tool_call = partial_json_parser.loads( + tool_call_portion or "{}", + flags) if tool_call_portion else None + logger.debug("Parsed tool call %s", current_tool_call) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + except json.decoder.JSONDecodeError: + logger.debug("unable to parse JSON") + return None + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. + if not self.current_tool_name_sent: + if (current_tool_call is None): + return None + function_name: Union[str, None] = current_tool_call.get("name") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=make_tool_call_id(), + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + else: + return None + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = DeltaMessage(content=delta_text) \ + if text_portion is not None else None + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. + + logger.debug("Trying to parse current tool call with ID %s", + self.current_tool_id) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = ( + self.prev_tool_call_arr[self.current_tool_id].get("arguments")) + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error("should be impossible to have arguments reset " + "mid-call. skipping streaming anything.") + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) + logger.debug("finding %s in %s", delta_text, + cur_arguments_json) + + # get the location where previous args differ from current + if (delta_text not in cur_arguments_json[:-2]): + return None + args_delta_start_loc = cur_arguments_json[:-2]. \ + rindex(delta_text) + \ + len(delta_text) + + # use that to find the actual delta + arguments_delta = cur_arguments_json[:args_delta_start_loc] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += arguments_delta + + # last case -- we have an update to existing arguments. + elif cur_arguments and prev_arguments: + if isinstance(delta_text, str) and len(delta_text.rstrip( + )) >= 1 and delta_text.rstrip()[-1] == '}': + delta_text = delta_text.rstrip()[:-1] + + logger.debug("got diff %s", delta_text) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=delta_text).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += delta_text + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[self.current_tool_id] = \ + current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + return None # do not stream a delta. skip this token ID. From 9eb122d521d5a8c7f2b2a8fd429e3dbdbcbb3e7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Tue, 2 Sep 2025 15:43:14 +0800 Subject: [PATCH 2/9] Support LongCat-Flash-Chat tool call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- vllm/entrypoints/openai/tool_parsers/__init__.py | 2 +- vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 4c5fedb88e74..ab87037d85ec 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -14,7 +14,7 @@ from .kimi_k2_tool_parser import KimiK2ToolParser from .llama4_pythonic_tool_parser import Llama4PythonicToolParser from .llama_tool_parser import Llama3JsonToolParser -from .longcat_tool_parser import LongCatFlashChatToolParser +from .longcat_tool_parser import LongcatFlashToolParser from .minimax_tool_parser import MinimaxToolParser from .mistral_tool_parser import MistralToolParser from .phi4mini_tool_parser import Phi4MiniJsonToolParser diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 9ea61fb95c85..95d890ad89b2 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -24,7 +24,7 @@ @ToolParserManager.register_module("longcat") -class LongCatFlashChatToolParser(ToolParser): +class LongcatFlashToolParser(ToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) From d1a8a68aa56b383c4e7858a6826f96ca134c153b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Tue, 2 Sep 2025 17:16:06 +0800 Subject: [PATCH 3/9] Support LongCat-Flash-Chat tool call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- vllm/entrypoints/openai/tool_parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index ab87037d85ec..f0e7f97df0d9 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -35,7 +35,7 @@ "Llama3JsonToolParser", "JambaToolParser", "Llama4PythonicToolParser", - "LongCatFlashChatToolParser" + "LongcatFlashToolParser" "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser", From 12a2446ded00767de6e962f6792f61018ff68400 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Tue, 2 Sep 2025 17:17:47 +0800 Subject: [PATCH 4/9] Support LongCat-Flash-Chat tool call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- vllm/entrypoints/openai/tool_parsers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index f0e7f97df0d9..51b004228379 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -35,7 +35,7 @@ "Llama3JsonToolParser", "JambaToolParser", "Llama4PythonicToolParser", - "LongcatFlashToolParser" + "LongcatFlashToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser", From c1e4a9a28bad296952e9c28894ada3f45ca133df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Wed, 24 Sep 2025 14:43:53 +0800 Subject: [PATCH 5/9] Add longcat tool parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- .../openai/tool_parsers/longcat_tool_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 95d890ad89b2..9f5b9f8b422e 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -97,6 +97,15 @@ def tool_call_delta_buffer(self, delta_text: str): else: return delta_text + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because the tool_call tokens are + # marked "special" in some models. Since they are skipped + # prior to the call to the tool parser, it breaks tool calling. + request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, From babc0f189ddcfc8982fc2fc7bcb17469a3321f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Wed, 24 Sep 2025 14:46:39 +0800 Subject: [PATCH 6/9] merge latest hermes tool parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- .../tool_parsers/longcat_tool_parser.py | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 9f5b9f8b422e..578d4c0086a8 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -367,16 +367,32 @@ def extract_tool_calls_streaming( # case -- we now have the first info about arguments available from # autocompleting the JSON elif cur_arguments and not prev_arguments: + # extract the content after {"name": ..., "arguments": + # directly from tool_call_portion as cur_arguments_json, + # since cur_arguments may differ from the original text + # due to partial JSON parsing + # for example, tool_call_portion = + # {"name": "search", "arguments": {"search_request": {" + # but cur_arguments = + # {"search_request": {}} + function_name = current_tool_call.get("name") + match = re.search( + r'\{"name":\s*"' + + re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)', + tool_call_portion.strip(), re.DOTALL) + if match: + cur_arguments_json = match.group(1) + else: + cur_arguments_json = json.dumps(cur_arguments, + ensure_ascii=False) - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) logger.debug("finding %s in %s", delta_text, cur_arguments_json) - # get the location where previous args differ from current - if (delta_text not in cur_arguments_json[:-2]): + # get the location where previous args differ from current. + if (delta_text not in cur_arguments_json): return None - args_delta_start_loc = cur_arguments_json[:-2]. \ + args_delta_start_loc = cur_arguments_json. \ rindex(delta_text) + \ len(delta_text) @@ -396,8 +412,20 @@ def extract_tool_calls_streaming( # last case -- we have an update to existing arguments. elif cur_arguments and prev_arguments: - if isinstance(delta_text, str) and len(delta_text.rstrip( - )) >= 1 and delta_text.rstrip()[-1] == '}': + # judge whether the tool_call_portion is a complete JSON + try: + json.loads(tool_call_portion) + is_complete_json = True + except Exception: + is_complete_json = False + + # if the delta_text ends with a '}' and tool_call_portion is a + # complete JSON, then the last '}' does not belong to the + # arguments, so we should trim it off + if isinstance(delta_text, str) \ + and len(delta_text.rstrip()) >= 1 \ + and delta_text.rstrip()[-1] == '}' \ + and is_complete_json: delta_text = delta_text.rstrip()[:-1] logger.debug("got diff %s", delta_text) From c3eabb9902d064e36f2e533e60da8d8a09cd2976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Wed, 24 Sep 2025 15:33:28 +0800 Subject: [PATCH 7/9] inherit from Hermes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- .../tool_parsers/longcat_tool_parser.py | 423 +----------------- 1 file changed, 6 insertions(+), 417 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 578d4c0086a8..45d264c83118 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -1,45 +1,25 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import json -from collections.abc import Sequence -from typing import Union - -import partial_json_parser import regex as re -from partial_json_parser.core.options import Allow -from vllm.entrypoints.chat_utils import make_tool_call_id -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaFunctionCall, DeltaMessage, - DeltaToolCall, - ExtractedToolCallInformation, - FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( - ToolParser, ToolParserManager) + ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import ( + Hermes2ProToolParser) + from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) @ToolParserManager.register_module("longcat") -class LongcatFlashToolParser(ToolParser): +class LongcatFlashToolParser(Hermes2ProToolParser): def __init__(self, tokenizer: AnyTokenizer): super().__init__(tokenizer) - if isinstance(self.model_tokenizer, MistralTokenizer): - logger.error( - "Detected Mistral tokenizer when using a Hermes model") - self.model_tokenizer = self.model_tokenizer.tokenizer - - self.current_tool_name_sent: bool = False - self.prev_tool_call_arr: list[dict] = [] - self.current_tool_id: int = -1 - self.streamed_args_for_tool: list[str] = [ - ] # map what has been streamed for each tool so far to a list - self.tool_call_start_token: str = "" self.tool_call_end_token: str = "" @@ -47,10 +27,6 @@ def __init__(self, tokenizer: AnyTokenizer): r"(.*?)|(.*)", re.DOTALL) - if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ToolParser " - "constructor during construction.") self.tool_call_start_token_ids = self.model_tokenizer.encode( self.tool_call_start_token, add_special_tokens=False) self.tool_call_end_token_ids = self.model_tokenizer.encode( @@ -65,390 +41,3 @@ def __init__(self, tokenizer: AnyTokenizer): self.model_tokenizer.decode([token_id]) for token_id in self.tool_call_end_token_ids ] - - self.buffered_delta_text = "" - - # Very simple idea: when encountering tokens like <, tool, _call, >, - # <, /, tool, _call, >, store them in a buffer. - # When the last token is encountered, empty the buffer and return it. - # If a token appears in an incorrect sequence while storing in the buffer, - # return the preceding buffer along with the token. - def tool_call_delta_buffer(self, delta_text: str): - # If the sequence of tool_call_start or tool_call_end tokens is not yet - # complete, fill the buffer with the token and return "". - if (delta_text in self.tool_call_start_token_array - or delta_text in self.tool_call_end_token_array): - # If delta_text is the last token of tool_call_start_token or - # tool_call_end_token, empty the buffer and return - # the buffered text + delta_text. - if (delta_text == self.tool_call_start_token_array[-1] - or delta_text == self.tool_call_end_token_array[-1]): - buffered_text = self.buffered_delta_text - self.buffered_delta_text = "" - return buffered_text + delta_text - else: - self.buffered_delta_text = self.buffered_delta_text + delta_text - return "" - else: - if self.buffered_delta_text: - buffered_text = self.buffered_delta_text - self.buffered_delta_text = "" - return buffered_text + delta_text - else: - return delta_text - - def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: - if request.tools and request.tool_choice != 'none': - # do not skip special tokens because the tool_call tokens are - # marked "special" in some models. Since they are skipped - # prior to the call to the tool parser, it breaks tool calling. - request.skip_special_tokens = False - return request - - def extract_tool_calls( - self, - model_output: str, - request: ChatCompletionRequest, - ) -> ExtractedToolCallInformation: - - # sanity check; avoid unnecessary processing - if self.tool_call_start_token not in model_output: - return ExtractedToolCallInformation(tools_called=False, - tool_calls=[], - content=model_output) - - else: - - try: - # there are two possible captures - between tags, or between a - # tag and end-of-string so the result of - # findall is an array of tuples where one is a function call and - # the other is None - function_call_tuples = ( - self.tool_call_regex.findall(model_output)) - - # load the JSON, and then use it to build the Function and - # Tool Call - raw_function_calls = [ - json.loads(match[0] if match[0] else match[1]) - for match in function_call_tuples - ] - tool_calls = [ - ToolCall( - type="function", - function=FunctionCall( - name=function_call["name"], - # function call args are JSON but as a string - arguments=json.dumps(function_call["arguments"], - ensure_ascii=False))) - for function_call in raw_function_calls - ] - - content = model_output[:model_output. - find(self.tool_call_start_token)] - return ExtractedToolCallInformation( - tools_called=True, - tool_calls=tool_calls, - content=content if content else None) - - except Exception: - logger.exception( - "Error in extracting tool call from response.") - return ExtractedToolCallInformation(tools_called=False, - tool_calls=[], - content=model_output) - - def extract_tool_calls_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: - # 1. All tokens are parsed based on _text, not token_ids. - # 2. All incoming text data is processed by the tool_call_delta_buffer - # function for buffering before being used for parsing. - - delta_text = self.tool_call_delta_buffer(delta_text) - # If the last characters of previous_text - # match self.buffered_delta_text, remove only the matching part. - if (len(previous_text) >= len(self.buffered_delta_text) - and previous_text[-len(self.buffered_delta_text):] - == self.buffered_delta_text): - previous_text = previous_text[:-len(self.buffered_delta_text)] - current_text = previous_text + delta_text - - logger.debug("delta_text: %s", delta_text) - logger.debug("delta_token_ids: %s", delta_token_ids) - # check to see if we should be streaming a tool call - is there a - if self.tool_call_start_token not in current_text: - logger.debug("No tool call tokens found!") - return DeltaMessage(content=delta_text) - - try: - - # figure out where we are in the parsing by counting tool call - # start & end tags - prev_tool_start_count = previous_text.count( - self.tool_call_start_token) - prev_tool_end_count = previous_text.count(self.tool_call_end_token) - cur_tool_start_count = current_text.count( - self.tool_call_start_token) - cur_tool_end_count = current_text.count(self.tool_call_end_token) - tool_call_portion = None - text_portion = None - - # case: if we're generating text, OR rounding out a tool call - if (cur_tool_start_count == cur_tool_end_count - and prev_tool_end_count == cur_tool_end_count - and self.tool_call_end_token not in delta_text): - logger.debug("Generating text content! skipping tool parsing.") - return DeltaMessage(content=delta_text) - - if self.tool_call_end_token in delta_text: - logger.debug("tool_call_end_token in delta_text") - full_text = current_text + delta_text - tool_call_portion = full_text.split( - self.tool_call_start_token)[-1].split( - self.tool_call_end_token)[0].rstrip() - delta_text = delta_text.split( - self.tool_call_end_token)[0].rstrip() - text_portion = delta_text.split( - self.tool_call_end_token)[-1].lstrip() - - # case: if tool open & close tag counts don't match, we're doing - # imaginary "else" block here - # something with tools with this diff. - # flags for partial JSON parting. exported constants from - # "Allow" are handled via BIT MASK - flags = Allow.ALL if self.current_tool_name_sent \ - else Allow.ALL & ~Allow.STR - - # case -- we're starting a new tool call - if (cur_tool_start_count > cur_tool_end_count - and cur_tool_start_count > prev_tool_start_count): - if len(delta_token_ids) > 1: - tool_call_portion = current_text.split( - self.tool_call_start_token)[-1] - else: - tool_call_portion = None - delta = None - - text_portion = None - - # set cursors and state appropriately - self.current_tool_id += 1 - self.current_tool_name_sent = False - self.streamed_args_for_tool.append("") - logger.debug("Starting on a new tool %s", self.current_tool_id) - - # case -- we're updating an existing tool call - elif (cur_tool_start_count > cur_tool_end_count - and cur_tool_start_count == prev_tool_start_count): - - # get the portion of the text that's the tool call - tool_call_portion = current_text.split( - self.tool_call_start_token)[-1] - text_portion = None - - # case -- the current tool call is being closed. - elif (cur_tool_start_count == cur_tool_end_count - and cur_tool_end_count >= prev_tool_end_count): - if (self.prev_tool_call_arr is None - or len(self.prev_tool_call_arr) == 0): - logger.debug( - "attempting to close tool call, but no tool call") - return None - diff = self.prev_tool_call_arr[self.current_tool_id].get( - "arguments") - if diff: - diff = diff.encode('utf-8').decode( - 'unicode_escape') if diff is str else diff - if ('"}' not in delta_text): - return None - end_loc = delta_text.rindex('"}') - diff = delta_text[:end_loc] + '"}' - logger.debug( - "Finishing tool and found diff that had not " - "been streamed yet: %s", diff) - self.streamed_args_for_tool[self.current_tool_id] \ - += diff - return DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=diff).model_dump( - exclude_none=True)) - ]) - - # case -- otherwise we're just generating text - else: - text = delta_text.replace(self.tool_call_start_token, "") - text = text.replace(self.tool_call_end_token, "") - delta = DeltaMessage(tool_calls=[], content=text) - return delta - - try: - - current_tool_call = partial_json_parser.loads( - tool_call_portion or "{}", - flags) if tool_call_portion else None - logger.debug("Parsed tool call %s", current_tool_call) - except partial_json_parser.core.exceptions.MalformedJSON: - logger.debug('not enough tokens to parse into JSON yet') - return None - except json.decoder.JSONDecodeError: - logger.debug("unable to parse JSON") - return None - - # case - we haven't sent the tool name yet. If it's available, send - # it. otherwise, wait until it's available. - if not self.current_tool_name_sent: - if (current_tool_call is None): - return None - function_name: Union[str, None] = current_tool_call.get("name") - if function_name: - self.current_tool_name_sent = True - return DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - type="function", - id=make_tool_call_id(), - function=DeltaFunctionCall( - name=function_name).model_dump( - exclude_none=True)) - ]) - else: - return None - # case -- otherwise, send the tool call delta - - # if the tool call portion is None, send the delta as text - if tool_call_portion is None: - # if there's text but not tool calls, send that - - # otherwise None to skip chunk - delta = DeltaMessage(content=delta_text) \ - if text_portion is not None else None - return delta - - # now, the nitty-gritty of tool calls - # now we have the portion to parse as tool call. - - logger.debug("Trying to parse current tool call with ID %s", - self.current_tool_id) - - # if we're starting a new tool call, push an empty object in as - # a placeholder for the arguments - if len(self.prev_tool_call_arr) <= self.current_tool_id: - self.prev_tool_call_arr.append({}) - - # main logic for tool parsing here - compare prev. partially-parsed - # JSON to the current partially-parsed JSON - prev_arguments = ( - self.prev_tool_call_arr[self.current_tool_id].get("arguments")) - cur_arguments = current_tool_call.get("arguments") - - logger.debug("diffing old arguments: %s", prev_arguments) - logger.debug("against new ones: %s", cur_arguments) - - # case -- no arguments have been created yet. skip sending a delta. - if not cur_arguments and not prev_arguments: - logger.debug("Skipping text %s - no arguments", delta_text) - delta = None - - # case -- prev arguments are defined, but non are now. - # probably impossible, but not a fatal error - just keep going - elif not cur_arguments and prev_arguments: - logger.error("should be impossible to have arguments reset " - "mid-call. skipping streaming anything.") - delta = None - - # case -- we now have the first info about arguments available from - # autocompleting the JSON - elif cur_arguments and not prev_arguments: - # extract the content after {"name": ..., "arguments": - # directly from tool_call_portion as cur_arguments_json, - # since cur_arguments may differ from the original text - # due to partial JSON parsing - # for example, tool_call_portion = - # {"name": "search", "arguments": {"search_request": {" - # but cur_arguments = - # {"search_request": {}} - function_name = current_tool_call.get("name") - match = re.search( - r'\{"name":\s*"' + - re.escape(function_name) + r'"\s*,\s*"arguments":\s*(.*)', - tool_call_portion.strip(), re.DOTALL) - if match: - cur_arguments_json = match.group(1) - else: - cur_arguments_json = json.dumps(cur_arguments, - ensure_ascii=False) - - logger.debug("finding %s in %s", delta_text, - cur_arguments_json) - - # get the location where previous args differ from current. - if (delta_text not in cur_arguments_json): - return None - args_delta_start_loc = cur_arguments_json. \ - rindex(delta_text) + \ - len(delta_text) - - # use that to find the actual delta - arguments_delta = cur_arguments_json[:args_delta_start_loc] - logger.debug("First tokens in arguments received: %s", - arguments_delta) - - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=arguments_delta).model_dump( - exclude_none=True)) - ]) - self.streamed_args_for_tool[self.current_tool_id] \ - += arguments_delta - - # last case -- we have an update to existing arguments. - elif cur_arguments and prev_arguments: - # judge whether the tool_call_portion is a complete JSON - try: - json.loads(tool_call_portion) - is_complete_json = True - except Exception: - is_complete_json = False - - # if the delta_text ends with a '}' and tool_call_portion is a - # complete JSON, then the last '}' does not belong to the - # arguments, so we should trim it off - if isinstance(delta_text, str) \ - and len(delta_text.rstrip()) >= 1 \ - and delta_text.rstrip()[-1] == '}' \ - and is_complete_json: - delta_text = delta_text.rstrip()[:-1] - - logger.debug("got diff %s", delta_text) - - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=delta_text).model_dump( - exclude_none=True)) - ]) - self.streamed_args_for_tool[self.current_tool_id] \ - += delta_text - - # handle saving the state for the current tool into - # the "prev" list for use in diffing for the next iteration - if self.current_tool_id == len(self.prev_tool_call_arr) - 1: - self.prev_tool_call_arr[self.current_tool_id] = \ - current_tool_call - else: - self.prev_tool_call_arr.append(current_tool_call) - - return delta - - except Exception: - logger.exception("Error trying to handle streaming tool call.") - return None # do not stream a delta. skip this token ID. From 34ab6ac1896f05df4fe89e9a72e9897b13824967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Wed, 24 Sep 2025 15:39:03 +0800 Subject: [PATCH 8/9] inherit from Hermes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 45d264c83118..3fdf8ebe40d8 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -8,11 +8,8 @@ from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import ( Hermes2ProToolParser) -from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer -logger = init_logger(__name__) - @ToolParserManager.register_module("longcat") class LongcatFlashToolParser(Hermes2ProToolParser): From 923024f71d7a5794571746b16f9fbf054f91cece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E6=96=87=E5=8D=BF?= Date: Wed, 24 Sep 2025 15:46:58 +0800 Subject: [PATCH 9/9] fix format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py index 3fdf8ebe40d8..87a3fdc44397 100644 --- a/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/longcat_tool_parser.py @@ -7,7 +7,6 @@ ToolParserManager) from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import ( Hermes2ProToolParser) - from vllm.transformers_utils.tokenizer import AnyTokenizer