Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
78cfae6
Add AgentReviewCritic (forks agent settings)
openhands-agent Jan 12, 2026
7248273
Merge branch 'main' into openhands/critic-agent-reviewer
xingyaoww Jan 13, 2026
6180b26
Fix example lint/type issues
openhands-agent Jan 13, 2026
7961dc8
Merge branch 'main' into openhands/critic-agent-reviewer
xingyaoww Jan 20, 2026
87d8aa3
feat: Add callback hook support and refactor AgentReviewCritic
openhands-agent Jan 20, 2026
bce1736
Refactor critic agent hook example to script style
openhands-agent Jan 20, 2026
4475dfc
Document examples script style convention in AGENTS.md
openhands-agent Jan 20, 2026
672204d
Merge main into openhands/critic-agent-reviewer
openhands-agent Feb 4, 2026
843e4ab
Fix circular import and duplicate example numbers
openhands-agent Feb 4, 2026
37e7e90
Merge remote-tracking branch 'origin/main' into openhands/critic-agen…
openhands-agent Feb 9, 2026
7aa3b6b
feat(critic): Add iterative refinement support to AgentReviewCritic
openhands-agent Feb 9, 2026
0a2b7ed
fix: rename 36_critic_example.py to 38_critic_example.py to avoid dup…
openhands-agent Feb 9, 2026
bc65099
Merge branch 'main' into openhands/critic-agent-reviewer
xingyaoww Feb 10, 2026
e758229
refactor: remove 38_critic_example.py, keep AgentReviewCritic as 34_c…
openhands-agent Feb 10, 2026
4c091ec
refactor: merge critic examples into 34_critic_example.py with mode s…
openhands-agent Feb 10, 2026
5a265e8
refactor: merge critic examples into 34_critic_example.py with mode s…
openhands-agent Feb 10, 2026
5550ac9
fix: exclude Callable fields from JSON schema to fix OpenAPI generation
openhands-agent Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -255,4 +255,11 @@ For examples that use the critic model (e.g., `34_critic_example.py`), the criti
- Ruff ignores `ARG` (unused arguments) under `tests/**/*.py` to allow pytest fixtures.
- Repository guidance lives in `AGENTS.md` (loaded as a third-party skill file).
</REPO_CONFIG_NOTES>

<EXAMPLES_STYLE>
- Examples in `examples/01_standalone_sdk/` should be written as direct scripts, NOT wrapped in a `main()` function.
- The script code should run at module level (after imports and helper function definitions).
- Keep examples concise: avoid excessive print statements that make the code long and less readable.
- See `examples/01_standalone_sdk/01_hello_world.py` for the canonical pattern.
</EXAMPLES_STYLE>
</REPO>
215 changes: 139 additions & 76 deletions examples/01_standalone_sdk/34_critic_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,57 @@
feedback that can trigger follow-up prompts when the agent hasn't completed the
task successfully.

Two critic modes are supported:

1. **API-based Critic** (CRITIC_MODE=api): Uses an external critic API endpoint.
Auto-configures for All-Hands LLM proxy, or uses explicit env vars.

2. **Agent Review Critic** (CRITIC_MODE=agent_review): Spawns a separate OpenHands
agent to do a PR-style review of the git diff.

Key concepts demonstrated:
1. Setting up a critic with IterativeRefinementConfig for automatic retry
2. Conversation.run() automatically handles retries based on critic scores
3. Custom follow-up prompt generation via critic.get_followup_prompt()
4. Iterating until the task is completed successfully or max iterations reached

For All-Hands LLM proxy (llm-proxy.*.all-hands.dev), the critic is auto-configured
using the same base_url with /vllm suffix and "critic" as the model name.
Requirements:
- export LLM_API_KEY=...
- optional: CRITIC_MODE (api|agent_review), LLM_MODEL, LLM_BASE_URL

Run:
# API-based critic (default)
python examples/01_standalone_sdk/34_critic_example.py

# Agent review critic
CRITIC_MODE=agent_review python examples/01_standalone_sdk/34_critic_example.py
"""

import os
import re
import signal
import subprocess
import tempfile
from pathlib import Path

from pydantic import SecretStr

from openhands.sdk import LLM, Agent, Conversation, Tool
from openhands.sdk.critic import APIBasedCritic, IterativeRefinementConfig
from openhands.sdk.critic.base import CriticBase
from openhands.sdk.critic.impl.agent_review import AgentReviewCritic
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.preset.critic import get_critic_agent
from openhands.tools.preset.default import get_default_agent
from openhands.tools.task_tracker import TaskTrackerTool
from openhands.tools.terminal import TerminalTool


signal.signal(signal.SIGINT, lambda *_: (_ for _ in ()).throw(KeyboardInterrupt()))


# Configuration
# Higher threshold (70%) makes it more likely the agent needs multiple iterations,
# which better demonstrates how iterative refinement works.
# Adjust as needed to see different behaviors.
CRITIC_MODE = os.getenv("CRITIC_MODE", "api") # "api" or "agent_review"
SUCCESS_THRESHOLD = float(os.getenv("CRITIC_SUCCESS_THRESHOLD", "0.7"))
MAX_ITERATIONS = int(os.getenv("MAX_ITERATIONS", "3"))

Expand All @@ -48,36 +72,16 @@ def get_required_env(name: str) -> str:
)


def get_default_critic(llm: LLM) -> CriticBase | None:
"""Auto-configure critic for All-Hands LLM proxy.
def get_api_critic(llm: LLM) -> CriticBase | None:
"""Auto-configure API-based critic for All-Hands LLM proxy.

When the LLM base_url matches `llm-proxy.*.all-hands.dev`, returns an
APIBasedCritic configured with:
- server_url: {base_url}/vllm
- api_key: same as LLM
- model_name: "critic"

Args:
llm: The LLM instance to derive critic configuration from.

Returns:
An APIBasedCritic if the LLM is configured for All-Hands proxy,
None otherwise.

Example:
llm = LLM(
model="anthropic/claude-sonnet-4-5",
api_key=api_key,
base_url="https://llm-proxy.eval.all-hands.dev",
)
critic = get_default_critic(llm)
if critic is None:
# Fall back to explicit configuration
critic = APIBasedCritic(
server_url="https://my-critic-server.com",
api_key="my-api-key",
model_name="my-critic-model",
)
Returns None if not using All-Hands proxy.
"""
base_url = llm.base_url
api_key = llm.api_key
Expand All @@ -96,10 +100,22 @@ def get_default_critic(llm: LLM) -> CriticBase | None:
)


# Task prompt designed to be moderately complex with subtle requirements.
# The task is simple enough to complete in 1-2 iterations, but has specific
# requirements that are easy to miss - triggering critic feedback.
INITIAL_TASK_PROMPT = """\
def _git(workspace: Path, *args: str) -> None:
    """Run ``git <args>`` with ``workspace`` as the working directory.

    Output is captured instead of being echoed to the terminal. Because the
    command runs with ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = ["git", *args]
    subprocess.run(command, cwd=workspace, check=True, capture_output=True)


def _git_patch(workspace: Path) -> str:
    """Return the text printed by ``git diff`` when run inside ``workspace``."""
    diff_command = ["git", "diff"]
    patch_text = subprocess.check_output(diff_command, cwd=workspace, text=True)
    return patch_text


# Task prompts for different modes
AGENT_REVIEW_TASK = (
"Edit calc.py to add a new function multiply(a, b) that "
"multiplies two numbers. Add proper type hints and a docstring. "
"Then finish."
)

API_CRITIC_TASK = """\
Create a Python word statistics tool called `wordstats` that analyzes text files.

## Structure
Expand Down Expand Up @@ -168,76 +184,123 @@ def get_default_critic(llm: LLM) -> CriticBase | None:
"""


# Setup LLM
llm_api_key = get_required_env("LLM_API_KEY")
llm = LLM(
# Use a weaker model to increase likelihood of needing multiple iterations
model="anthropic/claude-haiku-4-5",
api_key=llm_api_key,
usage_id="agent",
model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
api_key=SecretStr(llm_api_key),
top_p=0.95,
base_url=os.getenv("LLM_BASE_URL", None),
)

# Setup critic with iterative refinement config
# The IterativeRefinementConfig tells Conversation.run() to automatically
# retry the task if the critic score is below the threshold
# Setup iterative refinement config
iterative_config = IterativeRefinementConfig(
success_threshold=SUCCESS_THRESHOLD,
max_iterations=MAX_ITERATIONS,
)

# Auto-configure critic for All-Hands proxy or use explicit env vars
critic = get_default_critic(llm)
if critic is None:
print("⚠️ No All-Hands LLM proxy detected, trying explicit env vars...")
critic = APIBasedCritic(
server_url=get_required_env("CRITIC_SERVER_URL"),
api_key=get_required_env("CRITIC_API_KEY"),
model_name=get_required_env("CRITIC_MODEL_NAME"),
iterative_refinement=iterative_config,
)
else:
# Add iterative refinement config to the auto-configured critic
critic = critic.model_copy(update={"iterative_refinement": iterative_config})

# Create agent with critic (iterative refinement is built into the critic)
agent = Agent(
llm=llm,
tools=[
Tool(name=TerminalTool.name),
Tool(name=FileEditorTool.name),
Tool(name=TaskTrackerTool.name),
],
critic=critic,
)

# Create workspace
workspace = Path(tempfile.mkdtemp(prefix="critic_demo_"))
print(f"📁 Created workspace: {workspace}")

# Create conversation - iterative refinement is handled automatically
# by Conversation.run() based on the critic's config
# Setup critic based on mode
if CRITIC_MODE == "agent_review":
# Initialize git repo for agent review mode
_git(workspace, "init", "-q")
_git(workspace, "config", "user.email", "example@example.com")
_git(workspace, "config", "user.name", "Example")

# Create initial file for the task
(workspace / "calc.py").write_text(
"""def add(a, b):
return a + b


if __name__ == "__main__":
print(add(1, 2))
"""
)
_git(workspace, "add", "calc.py")
_git(workspace, "commit", "-m", "init", "-q")

critic: CriticBase = AgentReviewCritic(
llm=llm,
agent_factory=get_critic_agent,
review_style="roasted",
workspace_dir=str(workspace),
iterative_refinement=iterative_config,
)
task_prompt = AGENT_REVIEW_TASK
mode_description = "Agent Review Critic (PR-style code review)"

# Use default agent preset for agent review mode (cli_mode=True disables browser)
base_agent = get_default_agent(llm=llm, cli_mode=True)
agent = base_agent.model_copy(update={"critic": critic})

else: # API mode
# Auto-configure critic for All-Hands proxy or use explicit env vars
api_critic = get_api_critic(llm)
if api_critic is None:
print("⚠️ No All-Hands LLM proxy detected, trying explicit env vars...")
critic = APIBasedCritic(
server_url=get_required_env("CRITIC_SERVER_URL"),
api_key=get_required_env("CRITIC_API_KEY"),
model_name=get_required_env("CRITIC_MODEL_NAME"),
iterative_refinement=iterative_config,
)
else:
critic = api_critic.model_copy(
update={"iterative_refinement": iterative_config}
)
task_prompt = API_CRITIC_TASK
mode_description = "API-based Critic"

# Create agent with tools for API mode
agent = Agent(
llm=llm,
tools=[
Tool(name=TerminalTool.name),
Tool(name=FileEditorTool.name),
Tool(name=TaskTrackerTool.name),
],
critic=critic,
)

# Create conversation
conversation = Conversation(
agent=agent,
workspace=str(workspace),
)

print("\n" + "=" * 70)
print("🚀 Starting Iterative Refinement with Critic Model")
print(f"🚀 Starting Iterative Refinement with {mode_description}")
print("=" * 70)
print(f"Success threshold: {SUCCESS_THRESHOLD:.0%}")
print(f"Max iterations: {MAX_ITERATIONS}")
print("\nThe agent will work on the task, and the critic will evaluate progress.")
print("If the critic finds issues, it will provide feedback for improvement.\n")

# Send the task and run - Conversation.run() handles retries automatically
conversation.send_message(INITIAL_TASK_PROMPT)
# Send the task and run
conversation.send_message(task_prompt)
conversation.run()

# Print additional info about created files
print("\nCreated files:")
for path in sorted(workspace.rglob("*")):
if path.is_file():
relative = path.relative_to(workspace)
print(f" - {relative}")
# Show results based on mode
if CRITIC_MODE == "agent_review":
patch = _git_patch(workspace)
if patch:
print("\n[Current git diff]")
print(patch[:500] + "..." if len(patch) > 500 else patch)
else:
print("\nCreated files:")
for path in sorted(workspace.rglob("*")):
if path.is_file():
relative = path.relative_to(workspace)
print(f" - {relative}")

print("\n" + "=" * 70)
print("Example Complete!")
print("=" * 70)

# Report cost
cost = llm.metrics.accumulated_cost
cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
print(f"\nEXAMPLE_COST: {cost:.4f}")
12 changes: 12 additions & 0 deletions openhands-sdk/openhands/sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@
except PackageNotFoundError:
__version__ = "0.0.0" # fallback for editable/unbuilt environments


# Rebuild models that have forward references now that all imports are done
def _rebuild_forward_refs() -> None:
    """Rebuild ``AgentReviewCritic`` so its ``Agent`` forward reference resolves.

    ``Agent`` is taken from this module's namespace and handed to Pydantic's
    ``model_rebuild`` under the name the forward reference uses.
    """
    # NOTE(review): imported inside the function rather than at module top,
    # presumably to avoid an import cycle -- confirm before hoisting it.
    from openhands.sdk.critic.impl.agent_review import AgentReviewCritic

    # Map the forward-referenced name to the real class for the rebuild.
    types_namespace = {"Agent": Agent}
    AgentReviewCritic.model_rebuild(_types_namespace=types_namespace)


_rebuild_forward_refs()

__all__ = [
"LLM",
"LLMRegistry",
Expand Down
2 changes: 2 additions & 0 deletions openhands-sdk/openhands/sdk/critic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from openhands.sdk.critic.base import CriticBase, IterativeRefinementConfig
from openhands.sdk.critic.impl import (
AgentFinishedCritic,
AgentReviewCritic,
APIBasedCritic,
EmptyPatchCritic,
PassCritic,
Expand All @@ -15,6 +16,7 @@
"IterativeRefinementConfig",
# Critic implementations
"AgentFinishedCritic",
"AgentReviewCritic",
"APIBasedCritic",
"EmptyPatchCritic",
"PassCritic",
Expand Down
2 changes: 2 additions & 0 deletions openhands-sdk/openhands/sdk/critic/impl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Critic implementations module."""

from openhands.sdk.critic.impl.agent_finished import AgentFinishedCritic
from openhands.sdk.critic.impl.agent_review import AgentReviewCritic
from openhands.sdk.critic.impl.api import APIBasedCritic
from openhands.sdk.critic.impl.empty_patch import EmptyPatchCritic
from openhands.sdk.critic.impl.pass_critic import PassCritic


__all__ = [
"AgentFinishedCritic",
"AgentReviewCritic",
"APIBasedCritic",
"EmptyPatchCritic",
"PassCritic",
Expand Down
Loading
Loading