Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
11f1593
feat: Support reasoning content in Agent SDK
enyst Sep 6, 2025
abbf611
Address PR review comments
openhands-agent Sep 6, 2025
4ca0ef2
Remove Anthropic-specific thinking_blocks field, keep only reasoning_…
openhands-agent Sep 7, 2025
2403165
Merge branch 'main' into openhands/support-reasoning-content
xingyaoww Sep 7, 2025
9d244a4
fix missing comma
xingyaoww Sep 7, 2025
1bf97c2
add reasoning content to all relevant fields
xingyaoww Sep 7, 2025
1aef2e6
directly assign reasoning content since it is also str | None
xingyaoww Sep 7, 2025
6155a25
update repo md
xingyaoww Sep 7, 2025
59c7569
track reasoning tokens
xingyaoww Sep 7, 2025
e23d198
add convertible to other fields too
xingyaoww Sep 7, 2025
a645fc2
support showing reasoning token in metrics
xingyaoww Sep 7, 2025
399a968
examples: add reasoning debug example to visualize Message.reasoning_…
enyst Sep 7, 2025
412c38f
add reasoning file
enyst Sep 7, 2025
d01a8ab
add missing deepseek-reasoner
enyst Sep 8, 2025
083807c
examples: add multi-model reasoning probe and clean up; sdk.llm: remo…
enyst Sep 8, 2025
1a82e87
fix name
enyst Sep 8, 2025
7679f4e
Reasoning: surface reasoning_content end-to-end; add debug example; D…
enyst Sep 8, 2025
a61214b
examples: focus defaults on DeepSeek R1 (deepseek-r1-0528) for debugg…
enyst Sep 8, 2025
bb0642d
examples(reasoning_debug): restore defaults to deepseek-reasoner + ge…
enyst Sep 8, 2025
73605e9
merge: resolve conflicts with upstream/main\n\n- llm_convertible.Acti…
enyst Sep 8, 2025
f1fa4e6
Update openhands/sdk/llm/llm.py
enyst Sep 8, 2025
a195ce9
fix: tolerate reasoning_content on MessageEvent input; make Metrics.a…
enyst Sep 8, 2025
9b2f9b9
tests: remove deepseek-r1-0528 from model_features tests; align expec…
enyst Sep 8, 2025
8cbd0f1
set explicit default for Gemini
enyst Sep 8, 2025
f057010
Update openhands/sdk/llm/llm.py
enyst Sep 8, 2025
20a766e
chore: ruff-format fixes after removing thinking_blocks passthrough\n…
enyst Sep 8, 2025
db5192e
fix: remove thought/reasoning_content from AgentErrorEvent and its us…
enyst Sep 8, 2025
1b06c01
fix(llm): only forward tools when native function-calling is active t…
enyst Sep 9, 2025
e858943
chore(examples): remove debugging-only reasoning probe from PR; keep …
enyst Sep 9, 2025
539b450
also save cache read
xingyaoww Sep 9, 2025
c79029c
simplify duplicated logic
xingyaoww Sep 9, 2025
51494f3
remove unused test and revert changes
xingyaoww Sep 9, 2025
1f87277
Merge commit '46b1fa541af21be5849bcc5775efd2747ef36b41' into openhand…
xingyaoww Sep 9, 2025
048202c
rename example
xingyaoww Sep 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .openhands/microagents/repo.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ The simplified pattern eliminates the need for manual executor instantiation and
- Avoid using # type: ignore. Treat it only as a last resort. In most cases, issues should be resolved by improving type annotations, adding assertions, or adjusting code/tests—rather than silencing the type checker.
- Please AVOID using # type: ignore[attr-defined] unless absolutely necessary. If the issue can be addressed by adding a few extra assert statements to verify types, prefer that approach instead!
- For issues like # type: ignore[call-arg]: if you discover that the argument doesn't actually exist, do not try to mock it again in tests. Instead, simply remove it.
- Avoid getattr/hasattr guards and instead enforce type correctness by relying on explicit type assertions and proper object usage, ensuring functions only receive the expected Pydantic models or typed inputs.
</CODE>

<TESTING>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
api_key = os.getenv("LITELLM_API_KEY")
assert api_key is not None, "LITELLM_API_KEY environment variable is not set."
llm = LLM(
model="litellm_proxy/anthropic/claude-sonnet-4-20250514",
# model="litellm_proxy/gemini/gemini-2.5-pro",
model="litellm_proxy/deepseek/deepseek-reasoner",
base_url="https://llm-proxy.eval.all-hands.dev",
api_key=SecretStr(api_key),
)
Expand Down
14 changes: 12 additions & 2 deletions openhands/sdk/agent/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ def step(
if i == 0
else [], # Only first gets thought
metrics=metrics if i == len(tool_calls) - 1 else None,
# Only first gets reasoning content
reasoning_content=message.reasoning_content if i == 0 else None,
)
if action_event is None:
continue
Expand Down Expand Up @@ -254,6 +256,7 @@ def _get_action_events(
on_event: ConversationCallbackType,
thought: list[TextContent] = [],
metrics: MetricsSnapshot | None = None,
reasoning_content: str | None = None,
) -> ActionEvent | None:
"""Handle tool calls from the LLM.

Expand All @@ -267,7 +270,10 @@ def _get_action_events(
if tool is None:
err = f"Tool '{tool_name}' not found. Available: {list(self.tools.keys())}"
logger.error(err)
event = AgentErrorEvent(error=err, metrics=metrics)
event = AgentErrorEvent(
error=err,
metrics=metrics,
)
on_event(event)
state.agent_finished = True
return
Expand All @@ -282,14 +288,18 @@ def _get_action_events(
f"Error validating args {tool_call.function.arguments} for tool "
f"'{tool.name}': {e}"
)
event = AgentErrorEvent(error=err, metrics=metrics)
event = AgentErrorEvent(
error=err,
metrics=metrics,
)
on_event(event)
return

# Create one ActionEvent per action
action_event = ActionEvent(
action=action,
thought=thought,
reasoning_content=reasoning_content,
tool_name=tool.name,
tool_call_id=tool_call.id,
tool_call=tool_call,
Expand Down
12 changes: 11 additions & 1 deletion openhands/sdk/conversation/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,14 +123,17 @@ def abbr(n: int | float) -> str:
prompt = usage.prompt_tokens or 0
cache_read = usage.cache_read_tokens or 0
cache_rate = f"{(cache_read / prompt * 100):.2f}%" if prompt > 0 else "N/A"
reasoning_tokens = usage.reasoning_tokens or 0

# Cost
cost_str = f"{cost:.4f}" if cost > 0 else "$0.00"

# Build with fixed color scheme
parts: list[str] = []
parts.append(f"[cyan]↑ input {input_tokens}[/cyan]")
parts.append(f"[magenta]⚡ cache hit {cache_rate}[/magenta]")
parts.append(f"[magenta]cache hit {cache_rate}[/magenta]")
if reasoning_tokens > 0:
parts.append(f"[yellow] reasoning {abbr(reasoning_tokens)}[/yellow]")
parts.append(f"[blue]↓ output {output_tokens}[/blue]")
parts.append(f"[green]$ {cost_str}[/green]")

Expand All @@ -140,6 +143,12 @@ def _create_action_panel(self, event: ActionEvent) -> Panel:
"""Create a Rich Panel for ActionEvent with complete content."""
content = Text()

# Display reasoning content first if available (common to all three types)
if event.reasoning_content:
content.append("Reasoning:\n", style="bold magenta")
content.append(event.reasoning_content, style="white")
content.append("\n\n")

# Display complete thought content
thought_text = " ".join([t.text for t in event.thought])
if thought_text:
Expand Down Expand Up @@ -266,6 +275,7 @@ def _create_message_panel(self, event: MessageEvent) -> Panel:
def _create_error_panel(self, event: AgentErrorEvent) -> Panel:
"""Create a Rich Panel for AgentErrorEvent with complete content."""
content = Text()

content.append("Error Details:\n", style="bold red")
content.append(event.error, style="bright_red")

Expand Down
39 changes: 29 additions & 10 deletions openhands/sdk/event/llm_convertible.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import cast

from litellm import ChatCompletionMessageToolCall, ChatCompletionToolParam
from pydantic import Field
from pydantic import ConfigDict, Field, computed_field

from openhands.sdk.event.base import N_CHAR_PREVIEW, LLMConvertibleEvent
from openhands.sdk.event.types import SourceType
Expand Down Expand Up @@ -42,6 +42,10 @@ class ActionEvent(LLMConvertibleEvent):
thought: list[TextContent] = Field(
..., description="The thought process of the agent before taking this action"
)
reasoning_content: str | None = Field(
default=None,
description="Intermediate reasoning/thinking content from reasoning models",
)
action: Action = Field(..., description="Single action (tool call) returned by LLM")
tool_name: str = Field(..., description="The name of the tool being called")
tool_call_id: str = Field(
Expand Down Expand Up @@ -75,7 +79,12 @@ def to_llm_message(self) -> Message:
content: list[TextContent | ImageContent] = cast(
list[TextContent | ImageContent], self.thought
)
return Message(role="assistant", content=content, tool_calls=[self.tool_call])
return Message(
role="assistant",
content=content,
tool_calls=[self.tool_call],
reasoning_content=self.reasoning_content,
)

def __str__(self) -> str:
"""Plain text string representation for ActionEvent."""
Expand Down Expand Up @@ -131,10 +140,19 @@ class MessageEvent(LLMConvertibleEvent):

This is originally the "MessageAction", but it suppose not to be tool call."""

model_config = ConfigDict(extra="ignore")

source: SourceType
llm_message: Message = Field(
..., description="The exact LLM message for this message event"
)
metrics: MetricsSnapshot | None = Field(
default=None,
description=(
"Snapshot of LLM metrics (token counts and costs) for this message. "
"Only attached to messages from agent."
),
)

# context extensions stuff / microagent can go here
activated_microagents: list[str] = Field(
Expand All @@ -143,13 +161,10 @@ class MessageEvent(LLMConvertibleEvent):
extended_content: list[TextContent] = Field(
default_factory=list, description="List of content added by agent context"
)
metrics: MetricsSnapshot | None = Field(
default=None,
description=(
"Snapshot of LLM metrics (token counts and costs) for this message. "
"Only attached to messages from agent."
),
)

@computed_field
def reasoning_content(self) -> str:
return self.llm_message.reasoning_content or ""

def to_llm_message(self) -> Message:
msg = copy.deepcopy(self.llm_message)
Expand Down Expand Up @@ -220,7 +235,11 @@ def __str__(self) -> str:


class AgentErrorEvent(LLMConvertibleEvent):
"""Error triggered by the agent."""
"""Error triggered by the agent.

Note: This event should not contain model "thought" or "reasoning_content". It
represents an error produced by the agent/scaffold, not model output.
"""

source: SourceType = "agent"
error: str = Field(..., description="The error message from the scaffold")
Expand Down
32 changes: 21 additions & 11 deletions openhands/sdk/llm/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,10 @@ def completion(

# 3) normalize provider params
kwargs["tools"] = tools # we might remove this field in _normalize_call_kwargs
call_kwargs = self._normalize_call_kwargs(kwargs, has_tools=bool(tools))
has_tools_flag = (
bool(tools) and use_native_fc
) # only keep tools when native FC is active
call_kwargs = self._normalize_call_kwargs(kwargs, has_tools=has_tools_flag)
Copy link
Collaborator Author

@enyst enyst Sep 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note from the agent

  • Before fix, the proxy was returning model=deepseek-chat whenever we sent the tools field to a model that doesn’t support native function calling. That dropped reasoning_content and reasoning tokens, so the probe showed NO with 0 tokens.

... which is confirmed 🫠


# 4) optional request logging context (kept small)
assert self._telemetry is not None
Expand Down Expand Up @@ -495,11 +498,11 @@ def _normalize_call_kwargs(self, opts: dict, *, has_tools: bool) -> dict:
# Anthropic/OpenAI reasoning models ignore temp/top_p
out.pop("temperature", None)
out.pop("top_p", None)
# Gemini 2.5 budget mapping
# Gemini 2.5-pro default to low if not set
# otherwise litellm doesn't send reasoning, even though it happens
if "gemini-2.5-pro" in self.model:
if self.reasoning_effort in {None, "low", "none"}:
out["thinking"] = {"budget_tokens": 128}
out["allowed_openai_params"] = ["thinking"]
if self.reasoning_effort in {None, "none"}:
out["reasoning_effort"] = "low"

# Anthropic Opus 4.1: prefer temperature when
# both provided; disable extended thinking
Expand Down Expand Up @@ -563,14 +566,21 @@ def _all_choices(
"Expected non-streaming Choices when post-processing mocked tools"
)

non_fn_message: dict = resp.choices[0].message.model_dump()
fn_msgs = convert_non_fncall_messages_to_fncall_messages(
# Preserve provider-specific reasoning fields before conversion
orig_msg = resp.choices[0].message
non_fn_message: dict = orig_msg.model_dump()
fn_msgs: list[dict] = convert_non_fncall_messages_to_fncall_messages(
nonfncall_msgs + [non_fn_message], tools
)
last = fn_msgs[-1]
if not isinstance(last, LiteLLMMessage):
last = LiteLLMMessage(**last)
resp.choices[0].message = last
last: dict = fn_msgs[-1]

for name in ("reasoning_content", "provider_specific_fields"):
val = getattr(orig_msg, name, None)
if not val:
continue
last[name] = val

resp.choices[0].message = LiteLLMMessage.model_validate(last)
return resp

# =========================================================================
Expand Down
15 changes: 14 additions & 1 deletion openhands/sdk/llm/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ class Message(BaseModel):
name: str | None = None # name of the tool
# force string serializer
force_string_serializer: bool = False
# reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
reasoning_content: str | None = Field(
default=None,
description="Intermediate reasoning/thinking content from reasoning models",
)

@property
def contains_image(self) -> bool:
Expand Down Expand Up @@ -178,14 +183,22 @@ def _add_tool_call_keys(self, message_dict: dict[str, Any]) -> dict[str, Any]:

@classmethod
def from_litellm_message(cls, message: LiteLLMMessage) -> "Message":
"""Convert a litellm LiteLLMMessage to our Message class."""
"""Convert a LiteLLMMessage to our Message class.

Provider-agnostic mapping for reasoning:
- Prefer `message.reasoning_content` if present (LiteLLM normalized field)
"""
assert message.role != "function", "Function role is not supported"

rc = getattr(message, "reasoning_content", None)

return Message(
role=message.role,
content=[TextContent(text=message.content)]
if isinstance(message.content, str)
else [],
tool_calls=message.tool_calls,
reasoning_content=rc,
)


Expand Down
10 changes: 10 additions & 0 deletions openhands/sdk/llm/utils/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class TokenUsage(BaseModel):
cache_write_tokens: int = Field(
default=0, ge=0, description="Cache write tokens must be non-negative"
)
reasoning_tokens: int = Field(
default=0, ge=0, description="Reasoning tokens must be non-negative"
)
context_window: int = Field(
default=0, ge=0, description="Context window must be non-negative"
)
Expand All @@ -63,6 +66,7 @@ def __add__(self, other: "TokenUsage") -> "TokenUsage":
completion_tokens=self.completion_tokens + other.completion_tokens,
cache_read_tokens=self.cache_read_tokens + other.cache_read_tokens,
cache_write_tokens=self.cache_write_tokens + other.cache_write_tokens,
reasoning_tokens=self.reasoning_tokens + other.reasoning_tokens,
context_window=max(self.context_window, other.context_window),
per_turn_token=other.per_turn_token,
response_id=self.response_id,
Expand Down Expand Up @@ -122,6 +126,7 @@ def initialize_accumulated_token_usage(self) -> "Metrics":
completion_tokens=0,
cache_read_tokens=0,
cache_write_tokens=0,
reasoning_tokens=0,
context_window=0,
response_id="",
)
Expand Down Expand Up @@ -159,6 +164,7 @@ def add_token_usage(
cache_write_tokens: int,
context_window: int,
response_id: str,
reasoning_tokens: int = 0,
) -> None:
"""Add a single usage record."""
# Token each turn for calculating context usage.
Expand All @@ -170,6 +176,7 @@ def add_token_usage(
completion_tokens=completion_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
reasoning_tokens=reasoning_tokens,
context_window=context_window,
per_turn_token=per_turn_token,
response_id=response_id,
Expand All @@ -183,6 +190,7 @@ def add_token_usage(
completion_tokens=completion_tokens,
cache_read_tokens=cache_read_tokens,
cache_write_tokens=cache_write_tokens,
reasoning_tokens=reasoning_tokens,
context_window=context_window,
per_turn_token=per_turn_token,
response_id="",
Expand Down Expand Up @@ -286,6 +294,8 @@ def diff(self, baseline: "Metrics") -> "Metrics":
- base_usage.cache_read_tokens,
cache_write_tokens=current_usage.cache_write_tokens
- base_usage.cache_write_tokens,
reasoning_tokens=current_usage.reasoning_tokens
- base_usage.reasoning_tokens,
context_window=current_usage.context_window,
per_turn_token=0,
response_id="",
Expand Down
4 changes: 1 addition & 3 deletions openhands/sdk/llm/utils/model_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class ModelFeatures:
]

REASONING_EFFORT_PATTERNS: list[str] = [
# Mirror main behavior exactly (no unintended expansion), plus DeepSeek support
# Mirror main behavior exactly (no unintended expansion)
"o1-2024-12-17",
"o1",
"o3",
Expand All @@ -116,8 +116,6 @@ class ModelFeatures:
"gemini-2.5-pro",
"gpt-5",
"gpt-5-2025-08-07",
# DeepSeek reasoning family
"deepseek-r1-0528*",
]

PROMPT_CACHE_PATTERNS: list[str] = [
Expand Down
Loading
Loading