Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion livekit-agents/livekit/agents/llm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import asyncio
import base64
import inspect
import re
import sys
import types
from dataclasses import dataclass
Expand Down Expand Up @@ -44,6 +45,51 @@
THINK_TAG_END = "</think>"


def _sanitize_json_control_chars(json_str: str) -> str:
"""
Sanitize a JSON string by escaping control characters inside string values.

LLMs sometimes generate JSON with literal control characters (e.g., newlines)
inside string values, which violates the JSON spec and causes parsing to fail
with: ValueError: control character (\\u0000-\\u001F) found while parsing a string

This function escapes those control characters before parsing.
"""
if not json_str:
return json_str

def escape_control_chars_in_string(match: re.Match[str]) -> str:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you add some tests for the parsing function? it's important to ensure that it's handling the wide variety of valid JSON inputs without breaking them.

also, isn't it better to run a regex replacement to strip them out?

"""Escape control characters inside a matched JSON string value."""
string_content = match.group(0)
result = []
i = 0
while i < len(string_content):
char = string_content[i]
if char == "\\" and i + 1 < len(string_content):
# Already escaped sequence, keep as-is
result.append(char)
result.append(string_content[i + 1])
i += 2
elif ord(char) < 0x20: # Control characters (U+0000 to U+001F)
if char == "\n":
result.append("\\n")
elif char == "\r":
result.append("\\r")
elif char == "\t":
result.append("\\t")
else:
result.append(f"\\u{ord(char):04x}")
i += 1
else:
result.append(char)
i += 1
return "".join(result)

# Pattern to match JSON string values (handles escaped quotes)
string_pattern = r'"(?:[^"\\]|\\.)*"'
return re.sub(string_pattern, escape_control_chars_in_string, json_str)


def _compute_lcs(old_ids: list[str], new_ids: list[str]) -> list[str]:
"""
Standard dynamic-programming LCS to get the common subsequence
Expand Down Expand Up @@ -358,7 +404,7 @@ def prepare_function_arguments(

signature = inspect.signature(fnc)
type_hints = get_type_hints(fnc, include_extras=True)
args_dict = from_json(json_arguments)
args_dict = from_json(_sanitize_json_control_chars(json_arguments))

if is_function_tool(fnc):
model_type = function_arguments_to_pydantic_model(fnc)
Expand Down