Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
11f1593
feat: Support reasoning content in Agent SDK
enyst Sep 6, 2025
abbf611
Address PR review comments
openhands-agent Sep 6, 2025
4ca0ef2
Remove Anthropic-specific thinking_blocks field, keep only reasoning_…
openhands-agent Sep 7, 2025
2403165
Merge branch 'main' into openhands/support-reasoning-content
xingyaoww Sep 7, 2025
9d244a4
fix missing comma
xingyaoww Sep 7, 2025
1bf97c2
add reasoning content to all relevant fields
xingyaoww Sep 7, 2025
1aef2e6
directly assign reasoning content since it is also str | None
xingyaoww Sep 7, 2025
6155a25
update repo md
xingyaoww Sep 7, 2025
59c7569
track reasoning tokens
xingyaoww Sep 7, 2025
e23d198
add convertible to other fields too
xingyaoww Sep 7, 2025
a645fc2
support showing reasoning token in metrics
xingyaoww Sep 7, 2025
399a968
examples: add reasoning debug example to visualize Message.reasoning_…
enyst Sep 7, 2025
412c38f
add reasoning file
enyst Sep 7, 2025
d01a8ab
add missing deepseek-reasoner
enyst Sep 8, 2025
083807c
examples: add multi-model reasoning probe and clean up; sdk.llm: remo…
enyst Sep 8, 2025
1a82e87
fix name
enyst Sep 8, 2025
7679f4e
Reasoning: surface reasoning_content end-to-end; add debug example; D…
enyst Sep 8, 2025
a61214b
examples: focus defaults on DeepSeek R1 (deepseek-r1-0528) for debugg…
enyst Sep 8, 2025
bb0642d
examples(reasoning_debug): restore defaults to deepseek-reasoner + ge…
enyst Sep 8, 2025
73605e9
merge: resolve conflicts with upstream/main\n\n- llm_convertible.Acti…
enyst Sep 8, 2025
f1fa4e6
Update openhands/sdk/llm/llm.py
enyst Sep 8, 2025
a195ce9
fix: tolerate reasoning_content on MessageEvent input; make Metrics.a…
enyst Sep 8, 2025
9b2f9b9
tests: remove deepseek-r1-0528 from model_features tests; align expec…
enyst Sep 8, 2025
8cbd0f1
set explicit default for Gemini
enyst Sep 8, 2025
f057010
Update openhands/sdk/llm/llm.py
enyst Sep 8, 2025
20a766e
chore: ruff-format fixes after removing thinking_blocks passthrough\n…
enyst Sep 8, 2025
db5192e
fix: remove thought/reasoning_content from AgentErrorEvent and its us…
enyst Sep 8, 2025
1b06c01
fix(llm): only forward tools when native function-calling is active t…
enyst Sep 9, 2025
e858943
chore(examples): remove debugging-only reasoning probe from PR; keep …
enyst Sep 9, 2025
539b450
also save cache read
xingyaoww Sep 9, 2025
c79029c
simplify duplicated logic
xingyaoww Sep 9, 2025
51494f3
remove unused test and revert changes
xingyaoww Sep 9, 2025
1f87277
Merge commit '46b1fa541af21be5849bcc5775efd2747ef36b41' into openhand…
xingyaoww Sep 9, 2025
048202c
rename example
xingyaoww Sep 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .openhands/microagents/repo.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ The simplified pattern eliminates the need for manual executor instantiation and
- Avoid using # type: ignore. Treat it only as a last resort. In most cases, issues should be resolved by improving type annotations, adding assertions, or adjusting code/tests—rather than silencing the type checker.
- Please AVOID using # type: ignore[attr-defined] unless absolutely necessary. If the issue can be addressed by adding a few extra assert statements to verify types, prefer that approach instead!
- For issues like # type: ignore[call-arg]: if you discover that the argument doesn't actually exist, do not try to mock it again in tests. Instead, simply remove it.
- Avoid getattr/hasattr guards and instead enforce type correctness by relying on explicit type assertions and proper object usage, ensuring functions only receive the expected Pydantic models or typed inputs.
</CODE>

<TESTING>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
api_key = os.getenv("LITELLM_API_KEY")
assert api_key is not None, "LITELLM_API_KEY environment variable is not set."
llm = LLM(
model="litellm_proxy/anthropic/claude-sonnet-4-20250514",
# model="litellm_proxy/gemini/gemini-2.5-pro",
model="litellm_proxy/deepseek/deepseek-reasoner",
base_url="https://llm-proxy.eval.all-hands.dev",
api_key=SecretStr(api_key),
)
Expand Down
14 changes: 12 additions & 2 deletions openhands/sdk/agent/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def step(
if i == 0
else [], # Only first gets thought
metrics=metrics if i == len(tool_calls) - 1 else None,
# Only first gets reasoning content
reasoning_content=message.reasoning_content if i == 0 else None,
)
if action_event is None:
continue
Expand Down Expand Up @@ -254,6 +256,7 @@ def _get_action_events(
on_event: ConversationCallbackType,
thought: list[TextContent] = [],
metrics: MetricsSnapshot | None = None,
reasoning_content: str | None = None,
) -> ActionEvent | None:
"""Handle tool calls from the LLM.

Expand All @@ -267,7 +270,10 @@ def _get_action_events(
if tool is None:
err = f"Tool '{tool_name}' not found. Available: {list(self.tools.keys())}"
logger.error(err)
event = AgentErrorEvent(error=err, metrics=metrics)
event = AgentErrorEvent(
error=err,
metrics=metrics,
)
on_event(event)
state.agent_finished = True
return
Expand All @@ -282,14 +288,18 @@ def _get_action_events(
f"Error validating args {tool_call.function.arguments} for tool "
f"'{tool.name}': {e}"
)
event = AgentErrorEvent(error=err, metrics=metrics)
event = AgentErrorEvent(
error=err,
metrics=metrics,
)
on_event(event)
return

# Create one ActionEvent per action
action_event = ActionEvent(
action=action,
thought=thought,
reasoning_content=reasoning_content,
tool_name=tool.name,
tool_call_id=tool_call.id,
tool_call=tool_call,
Expand Down
12 changes: 11 additions & 1 deletion openhands/sdk/conversation/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,14 +123,17 @@ def abbr(n: int | float) -> str:
prompt = usage.prompt_tokens or 0
cache_read = usage.cache_read_tokens or 0
cache_rate = f"{(cache_read / prompt * 100):.2f}%" if prompt > 0 else "N/A"
reasoning_tokens = usage.reasoning_tokens or 0

# Cost
cost_str = f"{cost:.4f}" if cost > 0 else "$0.00"

# Build with fixed color scheme
parts: list[str] = []
parts.append(f"[cyan]↑ input {input_tokens}[/cyan]")
parts.append(f"[magenta]⚡ cache hit {cache_rate}[/magenta]")
parts.append(f"[magenta]cache hit {cache_rate}[/magenta]")
if reasoning_tokens > 0:
parts.append(f"[yellow] reasoning {abbr(reasoning_tokens)}[/yellow]")
parts.append(f"[blue]↓ output {output_tokens}[/blue]")
parts.append(f"[green]$ {cost_str}[/green]")

Expand All @@ -140,6 +143,12 @@ def _create_action_panel(self, event: ActionEvent) -> Panel:
"""Create a Rich Panel for ActionEvent with complete content."""
content = Text()

# Display reasoning content first if available (common to all three types)
if event.reasoning_content:
content.append("Reasoning:\n", style="bold magenta")
content.append(event.reasoning_content, style="white")
content.append("\n\n")

# Display complete thought content
thought_text = " ".join([t.text for t in event.thought])
if thought_text:
Expand Down Expand Up @@ -266,6 +275,7 @@ def _create_message_panel(self, event: MessageEvent) -> Panel:
def _create_error_panel(self, event: AgentErrorEvent) -> Panel:
"""Create a Rich Panel for AgentErrorEvent with complete content."""
content = Text()

content.append("Error Details:\n", style="bold red")
content.append(event.error, style="bright_red")

Expand Down
39 changes: 29 additions & 10 deletions openhands/sdk/event/llm_convertible.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import cast

from litellm import ChatCompletionMessageToolCall, ChatCompletionToolParam
from pydantic import Field
from pydantic import ConfigDict, Field, computed_field

from openhands.sdk.event.base import N_CHAR_PREVIEW, LLMConvertibleEvent
from openhands.sdk.event.types import SourceType
Expand Down Expand Up @@ -42,6 +42,10 @@ class ActionEvent(LLMConvertibleEvent):
thought: list[TextContent] = Field(
..., description="The thought process of the agent before taking this action"
)
reasoning_content: str | None = Field(
default=None,
description="Intermediate reasoning/thinking content from reasoning models",
)
action: Action = Field(..., description="Single action (tool call) returned by LLM")
tool_name: str = Field(..., description="The name of the tool being called")
tool_call_id: str = Field(
Expand Down Expand Up @@ -75,7 +79,12 @@ def to_llm_message(self) -> Message:
content: list[TextContent | ImageContent] = cast(
list[TextContent | ImageContent], self.thought
)
return Message(role="assistant", content=content, tool_calls=[self.tool_call])
return Message(
role="assistant",
content=content,
tool_calls=[self.tool_call],
reasoning_content=self.reasoning_content,
)

def __str__(self) -> str:
"""Plain text string representation for ActionEvent."""
Expand Down Expand Up @@ -131,10 +140,19 @@ class MessageEvent(LLMConvertibleEvent):

This is originally the "MessageAction", but it suppose not to be tool call."""

model_config = ConfigDict(extra="ignore")

source: SourceType
llm_message: Message = Field(
..., description="The exact LLM message for this message event"
)
metrics: MetricsSnapshot | None = Field(
default=None,
description=(
"Snapshot of LLM metrics (token counts and costs) for this message. "
"Only attached to messages from agent."
),
)

# context extensions stuff / microagent can go here
activated_microagents: list[str] = Field(
Expand All @@ -143,13 +161,10 @@ class MessageEvent(LLMConvertibleEvent):
extended_content: list[TextContent] = Field(
default_factory=list, description="List of content added by agent context"
)
metrics: MetricsSnapshot | None = Field(
default=None,
description=(
"Snapshot of LLM metrics (token counts and costs) for this message. "
"Only attached to messages from agent."
),
)

@computed_field
def reasoning_content(self) -> str:
return self.llm_message.reasoning_content or ""

def to_llm_message(self) -> Message:
msg = copy.deepcopy(self.llm_message)
Expand Down Expand Up @@ -220,7 +235,11 @@ def __str__(self) -> str:


class AgentErrorEvent(LLMConvertibleEvent):
"""Error triggered by the agent."""
"""Error triggered by the agent.

Note: This event should not contain model "thought" or "reasoning_content". It
represents an error produced by the agent/scaffold, not model output.
"""

source: SourceType = "agent"
error: str = Field(..., description="The error message from the scaffold")
Expand Down
32 changes: 21 additions & 11 deletions openhands/sdk/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,10 @@ def completion(

# 3) normalize provider params
kwargs["tools"] = tools # we might remove this field in _normalize_call_kwargs
call_kwargs = self._normalize_call_kwargs(kwargs, has_tools=bool(tools))
has_tools_flag = (
bool(tools) and use_native_fc
) # only keep tools when native FC is active
call_kwargs = self._normalize_call_kwargs(kwargs, has_tools=has_tools_flag)
Copy link
Collaborator Author

@enyst enyst Sep 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note from the agent

  • Before fix, the proxy was returning model=deepseek-chat whenever we sent the tools field to a model that doesn’t support native function calling. That dropped reasoning_content and reasoning tokens, so the probe showed NO with 0 tokens.

... which is confirmed 🫠


# 4) optional request logging context (kept small)
assert self._telemetry is not None
Expand Down Expand Up @@ -495,11 +498,11 @@ def _normalize_call_kwargs(self, opts: dict, *, has_tools: bool) -> dict:
# Anthropic/OpenAI reasoning models ignore temp/top_p
out.pop("temperature", None)
out.pop("top_p", None)
# Gemini 2.5 budget mapping
# Gemini 2.5-pro default to low if not set
# otherwise litellm doesn't send reasoning, even though it happens
if "gemini-2.5-pro" in self.model:
if self.reasoning_effort in {None, "low", "none"}:
out["thinking"] = {"budget_tokens": 128}
out["allowed_openai_params"] = ["thinking"]
if self.reasoning_effort in {None, "none"}:
out["reasoning_effort"] = "low"

# Anthropic Opus 4.1: prefer temperature when
# both provided; disable extended thinking
Expand Down Expand Up @@ -563,14 +566,21 @@ def _all_choices(
"Expected non-streaming Choices when post-processing mocked tools"
)

non_fn_message: dict = resp.choices[0].message.model_dump()
fn_msgs = convert_non_fncall_messages_to_fncall_messages(
# Preserve provider-specific reasoning fields before conversion
orig_msg = resp.choices[0].message
non_fn_message: dict = orig_msg.model_dump()
fn_msgs: list[dict] = convert_non_fncall_messages_to_fncall_messages(
nonfncall_msgs + [non_fn_message], tools
)
last = fn_msgs[-1]
if not isinstance(last, LiteLLMMessage):
last = LiteLLMMessage(**last)
resp.choices[0].message = last
last: dict = fn_msgs[-1]

for name in ("reasoning_content", "provider_specific_fields"):
val = getattr(orig_msg, name, None)
if not val:
continue
last[name] = val

resp.choices[0].message = LiteLLMMessage.model_validate(last)
return resp

# =========================================================================
Expand Down
15 changes: 14 additions & 1 deletion openhands/sdk/llm/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ class Message(BaseModel):
name: str | None = None # name of the tool
# force string serializer
force_string_serializer: bool = False
# reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
reasoning_content: str | None = Field(
default=None,
description="Intermediate reasoning/thinking content from reasoning models",
)

@property
def contains_image(self) -> bool:
Expand Down Expand Up @@ -178,14 +183,22 @@ def _add_tool_call_keys(self, message_dict: dict[str, Any]) -> dict[str, Any]:

@classmethod
def from_litellm_message(cls, message: LiteLLMMessage) -> "Message":
"""Convert a litellm LiteLLMMessage to our Message class."""
"""Convert a LiteLLMMessage to our Message class.

Provider-agnostic mapping for reasoning:
- Prefer `message.reasoning_content` if present (LiteLLM normalized field)
"""
assert message.role != "function", "Function role is not supported"

rc = getattr(message, "reasoning_content", None)

return Message(
role=message.role,
content=[TextContent(text=message.content)]
if isinstance(message.content, str)
else [],
tool_calls=message.tool_calls,
reasoning_content=rc,
)


Expand Down
10 changes: 10 additions & 0 deletions openhands/sdk/llm/utils/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class TokenUsage(BaseModel):
cache_write_tokens: int = Field(
default=0, ge=0, description="Cache write tokens must be non-negative"
)
reasoning_tokens: int = Field(
default=0, ge=0, description="Reasoning tokens must be non-negative"
)
context_window: int = Field(
default=0, ge=0, description="Context window must be non-negative"
)
Expand All @@ -63,6 +66,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage":
completion_tokens=self.completion_tokens + other.completion_tokens,
cache_read_tokens=self.cache_read_tokens + other.cache_read_tokens,
cache_write_tokens=self.cache_write_tokens + other.cache_write_tokens,
reasoning_tokens=self.reasoning_tokens + other.reasoning_tokens,
context_window=max(self.context_window, other.context_window),
per_turn_token=other.per_turn_token,
response_id=self.response_id,
Expand Down Expand Up @@ -122,6 +126,7 @@ def initialize_accumulated_token_usage(self) -> "Metrics":
completion_tokens=0,
cache_read_tokens=0,
cache_write_tokens=0,
reasoning_tokens=0,
context_window=0,
response_id="",
)
Expand Down Expand Up @@ -159,6 +164,7 @@ def add_token_usage(
cache_write_tokens: int,
context_window: int,
response_id: str,
reasoning_tokens: int = 0,
) -> None:
"""Add a single usage record."""
# Token each turn for calculating context usage.
Expand All @@ -170,6 +176,7 @@ def add_token_usage(
completion_tokens=completion_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
reasoning_tokens=reasoning_tokens,
context_window=context_window,
per_turn_token=per_turn_token,
response_id=response_id,
Expand All @@ -183,6 +190,7 @@ def add_token_usage(
completion_tokens=completion_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
reasoning_tokens=reasoning_tokens,
context_window=context_window,
per_turn_token=per_turn_token,
response_id="",
Expand Down Expand Up @@ -286,6 +294,8 @@ def diff(self, baseline: "Metrics") -> "Metrics":
- base_usage.cache_read_tokens,
cache_write_tokens=current_usage.cache_write_tokens
- base_usage.cache_write_tokens,
reasoning_tokens=current_usage.reasoning_tokens
- base_usage.reasoning_tokens,
context_window=current_usage.context_window,
per_turn_token=0,
response_id="",
Expand Down
4 changes: 1 addition & 3 deletions openhands/sdk/llm/utils/model_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class ModelFeatures:
]

REASONING_EFFORT_PATTERNS: list[str] = [
# Mirror main behavior exactly (no unintended expansion), plus DeepSeek support
# Mirror main behavior exactly (no unintended expansion)
"o1-2024-12-17",
"o1",
"o3",
Expand All @@ -116,8 +116,6 @@ class ModelFeatures:
"gemini-2.5-pro",
"gpt-5",
"gpt-5-2025-08-07",
# DeepSeek reasoning family
"deepseek-r1-0528*",
]

PROMPT_CACHE_PATTERNS: list[str] = [
Expand Down
Loading
Loading