Conversation
This reverts commit bb3fb1c.
FIRE-751 Feature | rogue Client side reporting the experiments to the server
if the env var / toml is set populate it in the input field |
Walkthrough: Adds Pydantic API models and conversion utilities for ApiEvaluationResult; shifts LLM summaries to a StructuredSummary JSON shape with parsing and fallbacks; introduces Qualifire reporting and a /report_summary endpoint; updates SDKs, CLI, UI, TUI, and tests to produce, consume, and display structured summaries.

Changes
Sequence Diagram(s)sequenceDiagram
autonumber
actor User
participant UI as Frontend UI
participant SDK as Python SDK
participant Server as Rogue Server
participant Conv as API Format Service
participant Qual as Qualifire
User->>UI: Request report / select tab
UI->>SDK: generate_summary(results, qualifire_api_key?, job_id?, deep_test?, judge_model?)
SDK-->>Server: POST /api/v1/llm/summary {job_id, deep_test, qualifire_api_key, judge_model}
Server->>Server: LLM produces StructuredSummary JSON (parse/validate)
Server-->>SDK: (markdown, StructuredSummary)
SDK-->>UI: (markdown, StructuredSummary)
UI->>Conv: convert_with_structured_summary(results, StructuredSummary, deep_test, start_time, judge_model)
Conv-->>UI: ApiEvaluationResult
UI->>UI: render ApiEvaluationResult.model_dump_json(indent=2, exclude_none=True)
alt report to Qualifire requested
Server->>Qual: POST {qualifire_url}/api/rogue/v1/report (X-qualifire-key, ApiEvaluationResult JSON)
Qual-->>Server: response or error
end
Estimated code review effort: 🎯 4 (Complex) | ⏱️ ~60 minutes

Pre-merge checks: ✅ Passed checks (3 passed)
Poem
📜 Recent review detailsConfiguration used: CodeRabbit UI Review profile: CHILL Plan: Pro 📒 Files selected for processing (2)
🚧 Files skipped from review as they are similar to previous changes (1)
🧰 Additional context used📓 Path-based instructions (1).github/**📄 CodeRabbit inference engine (AGENTS.md)
Files:
🔇 Additional comments (1)
✨ Finishing touches🧪 Generate unit tests
Comment |
There was a problem hiding this comment.
Actionable comments posted: 6
🧹 Nitpick comments (2)
sdks/python/rogue_sdk/types.py (1)
88-89: Nitpick: Line too long. Line 88 exceeds the typical 88-character limit for Python code. Consider breaking it into multiple lines.
- "Authentication Credentials cannot be empty for the selected auth type.", # noqa: E501 + "Authentication Credentials cannot be empty for the selected auth type.",rogue/ui/components/report_generator.py (1)
159-165: Consider using `model_dump_json` for the fallback as well. The fallback uses
`str(results)`, which may not provide a consistent JSON format.

except Exception as e: logger.warning( f"Failed to convert results to API format: {e}", extra={ "results": results, }, ) - results_json = str(results) + results_json = results.model_dump_json(indent=2)
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (3)
rogue/tests/models/test_evaluation_result.py(8 hunks)rogue/ui/components/report_generator.py(2 hunks)sdks/python/rogue_sdk/types.py(4 hunks)
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/ui/components/report_generator.pyrogue/tests/models/test_evaluation_result.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/ui/components/report_generator.pyrogue/tests/models/test_evaluation_result.py
🧬 Code graph analysis (1)
rogue/tests/models/test_evaluation_result.py (1)
rogue/ui/components/report_generator.py (2)
convert_to_api_format(44-94)ApiEvaluationResult(38-41)
🔇 Additional comments (3)
rogue/ui/components/report_generator.py (1)
151-166: Good error handling for API conversion. The try-except block properly handles conversion failures with logging and fallback behavior.
rogue/tests/models/test_evaluation_result.py (2)
168-191: Well-structured API conversion test. The test thoroughly validates the conversion to the new API format, including all nested structures and data types. Good coverage of the conversion logic.
62-63: Formatting change: trailing commas. The addition of trailing commas is a good practice for better version control diffs.
Also applies to: 68-69, 75-76, 81-82, 94-95, 100-101, 113-114, 119-120, 132-133, 138-139, 154-155
| # Conversion functions for new API format | ||
| def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: | ||
| """Convert legacy EvaluationResults to new API format. | ||
|
|
||
| Args: | ||
| evaluation_results: Legacy evaluation results to convert | ||
|
|
||
| Returns: | ||
| ApiEvaluationResult: New format evaluation result | ||
| """ | ||
| api_scenarios = [] | ||
|
|
||
| for result in evaluation_results.results: | ||
| # Convert conversations to new format | ||
| api_conversations = [] | ||
| for conv_eval in result.conversations: | ||
| # Convert ChatHistory messages to ApiChatMessage | ||
| api_messages = [] | ||
| for msg in conv_eval.messages.messages: | ||
| timestamp = datetime.now(timezone.utc) | ||
| if msg.timestamp: | ||
| try: | ||
| if isinstance(msg.timestamp, str): | ||
| timestamp = datetime.fromisoformat( | ||
| msg.timestamp.replace("Z", "+00:00"), | ||
| ) | ||
| else: | ||
| timestamp = msg.timestamp | ||
| except (ValueError, AttributeError): | ||
| timestamp = datetime.now(timezone.utc) | ||
|
|
||
| api_messages.append( | ||
| ApiChatMessage( | ||
| role=msg.role, | ||
| content=msg.content, | ||
| timestamp=timestamp, | ||
| ), | ||
| ) | ||
|
|
||
| api_conversations.append( | ||
| ApiConversationEvaluation( | ||
| passed=conv_eval.passed, | ||
| messages=api_messages, | ||
| reason=conv_eval.reason if conv_eval.reason else None, | ||
| ), | ||
| ) | ||
|
|
||
| api_scenarios.append( | ||
| ApiScenarioResult( | ||
| description=result.scenario.scenario, | ||
| totalConversations=len(api_conversations), | ||
| flaggedConversations=len( | ||
| [c for c in api_conversations if not c.passed], | ||
| ), | ||
| conversations=api_conversations, | ||
| ), | ||
| ) | ||
|
|
||
| return ApiEvaluationResult(scenarios=api_scenarios) | ||
|
|
There was a problem hiding this comment.
Code duplication detected: convert_to_api_format function.
The convert_to_api_format function is duplicated between sdks/python/rogue_sdk/types.py and rogue/ui/components/report_generator.py. This creates a significant maintenance burden.
The function should exist in only one location. Since the SDK is the source of truth for types, keep it in sdks/python/rogue_sdk/types.py and import it in the UI component.
🤖 Prompt for AI Agents
In sdks/python/rogue_sdk/types.py around lines 244-303 the convert_to_api_format
implementation is duplicated elsewhere; keep the canonical implementation here
and remove the duplicate in rogue/ui/components/report_generator.py, then make
the function importable from the SDK (ensure it is top-level in types.py and
exported via the package __init__ if needed), update report_generator.py to
import convert_to_api_format from sdks.python.rogue_sdk.types (or from the
package export) and remove the local copy, and run tests/flake8 to ensure
imports and references are correct.
|
|
||
|
|
||
| # Conversion functions for new API format | ||
| def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: |
There was a problem hiding this comment.
Design suggestion:
For each class API...(BaseModel) create a classmethod to convert from the regular type. For example:
class ApiChatMessage(BaseModel):
"""Chat message for new API format with datetime timestamp."""
role: str
content: str
timestamp: datetime
@classmethod
def from_chat_message(cls, message: ChatMessage) -> Self:
timestamp = datetime.now(timezone.utc)
if msg.timestamp:
try:
if isinstance(msg.timestamp, str):
timestamp = datetime.fromisoformat(
msg.timestamp.replace("Z", "+00:00"),
)
else:
timestamp = msg.timestamp
except (ValueError, AttributeError):
timestamp = datetime.now(timezone.utc)
return cls(
role=msg.role,
content=msg.content,
timestamp=timestamp,
)The same for the rest
yuval-qf
left a comment
There was a problem hiding this comment.
Left a question and a design suggestion
There was a problem hiding this comment.
Actionable comments posted: 0
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (3)
rogue/ui/components/scenario_runner.py (1)
484-501: Always close the SDK in a `finally` block to avoid a connection leak. If `generate_summary` raises, `sdk.close()` is skipped. Apply:
- # Generate summary using SDK (server-based) - summary = "Summary generation failed." - try: + # Generate summary using SDK (server-based) + summary = "Summary generation failed." + sdk: RogueSDK | None = None + try: sdk_config = RogueClientConfig( base_url=state.get("rogue_server_url", "http://localhost:8000"), timeout=600.0, ) - sdk = RogueSDK(sdk_config) + sdk = RogueSDK(sdk_config) summary = await sdk.generate_summary( results=all_results, model=config.get("service_llm"), api_key=config.get("judge_llm_api_key"), ) - await sdk.close() except Exception: logger.exception("Summary generation failed") + finally: + if sdk: + await sdk.close()sdks/python/rogue_sdk/websocket.py (1)
156-169: Avoid CancelledError in done-callback and duplicate exception lookups.
`Task.exception()` raises on cancelled tasks; the lambda also calls it twice. Apply:
- if asyncio.iscoroutinefunction(handler): - task = asyncio.create_task(handler(event, data)) - task.add_done_callback( - lambda t: ( - logger.exception( - "WS handler error", - exc_info=t.exception(), - ) - if t.exception() - else None - ), - ) + if asyncio.iscoroutinefunction(handler): + task = asyncio.create_task(handler(event, data)) + def _on_done(t: asyncio.Task) -> None: + try: + exc = t.exception() + except asyncio.CancelledError: + return + if exc: + logger.exception("WS handler error", exc_info=exc) + task.add_done_callback(_on_done)rogue/server/api/llm.py (1)
23-24: Add an explicit return type to satisfy typing guidelines. The function lacks a return annotation under the rogue/**/*.py rules.
-@router.post("/scenarios", response_model=ScenarioGenerationResponse) -async def generate_scenarios(request: ScenarioGenerationRequest): +@router.post("/scenarios", response_model=ScenarioGenerationResponse) +async def generate_scenarios( + request: ScenarioGenerationRequest, +) -> ScenarioGenerationResponse:
♻️ Duplicate comments (2)
sdks/python/rogue_sdk/types.py (1)
210-242: Consolidate API models to a single source of truth.
Api*models are duplicated here and inrogue/server/models/api_format.py. Keep one canonical set (SDK or server) and import from it to avoid drift.#!/bin/bash rg -nP '(class\s+Api(ChatMessage|ConversationEvaluation|ScenarioResult|EvaluationResult)\b)' \ sdks/python/rogue_sdk/types.py rogue/server/models/api_format.py rogue/ui/components -Srogue/tests/models/test_evaluation_result.py (1)
11-12: Import from the SDK types to avoid server/SDK duplication. Use the canonical SDK exports to keep tests decoupled from server internals (matches prior feedback).
-from rogue.server.services.api_format_service import convert_to_api_format -from rogue.server.models.api_format import ApiEvaluationResult, StructuredSummary +from rogue_sdk.types import ( + convert_to_api_format, + ApiEvaluationResult, + StructuredSummary, +)
🧹 Nitpick comments (7)
rogue/server/services/__init__.py (1)
2-2: Export looks good. Consider aligning with isort order. Optional: keep the import group alphabetized to satisfy isort consistently.
rogue/server/models/api_format.py (2)
41-57: PEP 8 naming vs API casing: prefer snake_case with aliases. Fields like
`totalConversations`, `flaggedConversations`, `startTime`, and `judgeModel` break snake_case. Consider snake_case internals with Pydantic v2 aliases to keep the API JSON camelCase. Example:
-from pydantic import BaseModel +from pydantic import BaseModel, Field, ConfigDict class ApiScenarioResult(BaseModel): + model_config = ConfigDict(populate_by_name=True) - totalConversations: Optional[int] = None - flaggedConversations: Optional[int] = None + total_conversations: Optional[int] = Field( + default=None, serialization_alias="totalConversations", validation_alias="totalConversations" + ) + flagged_conversations: Optional[int] = Field( + default=None, serialization_alias="flaggedConversations", validation_alias="flaggedConversations" + )Same approach for
ApiEvaluationResult.startTime -> start_time,judgeModel -> judge_model, andkeyFindings/recommendationif needed.
16-20: Use a stronger type for detailed_breakdown. Replace
`List[dict]` with a dedicated model (e.g., `BreakdownRow`) to get validation and IDE help.

sdks/python/rogue_sdk/types.py (1)
263-274: Log parse failures before falling back to now(). A silent fallback hides data issues; add a warning with the offending value and error.
Apply:
- except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) + except (ValueError, AttributeError) as parse_err: + logger.warning( + f"Failed to parse timestamp: {msg.timestamp!r}; " + f"falling back to now. error={parse_err}" + ) + timestamp = datetime.now(timezone.utc)sdks/python/rogue_sdk/sdk.py (1)
299-324: Include "Detailed Breakdown" in the markdown output (optional). Add a section rendering the per-scenario breakdown to improve UI completeness.
structured_summary = response_data.summary if hasattr(structured_summary, "overall_summary"): # Format as markdown string for UI display summary_parts = [ f"# Evaluation Results Summary\n\n## Overall Summary\n" f"{structured_summary.overall_summary}", ] if structured_summary.key_findings: findings = "\n".join( f"- {finding}" for finding in structured_summary.key_findings ) summary_parts.append(f"\n---\n\n## Key Findings\n{findings}") if structured_summary.recommendations: recommendations = "\n".join( f"{i + 1}. {rec}" for i, rec in enumerate(structured_summary.recommendations) ) summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}") + + if getattr(structured_summary, "detailed_breakdown", None): + rows = [ + "| Scenario | Status | Outcome |", + "|---|---|---|", + ] + for item in structured_summary.detailed_breakdown: + scenario = item.get("scenario", "") + status = item.get("status", "") + outcome = item.get("outcome", "") + rows.append(f"| {scenario} | {status} | {outcome} |") + table = "\n".join(rows) + summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{table}") return "\n".join(summary_parts) else: # Fallback for string response return str(structured_summary)rogue/tests/models/test_evaluation_result.py (1)
213-217: Remove duplicated assertions. These repeat checks already made on lines 197–201.
- # Test new fields - assert api_format.summary == "Test summary for overall evaluation" - assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" - assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" - assert api_format.deepTest is True - assert api_format.judgeModel == "openai/gpt-4o-mini" - assert isinstance(api_format.startTime, datetime) + assert isinstance(api_format.startTime, datetime)rogue/server/services/llm_service.py (1)
253-263: Deduplicate code-fence stripping. Factor it into a tiny helper to avoid repeating logic and to keep the parsing clear.
+def _strip_json_fences(content: str) -> str: + content = content.strip() + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + return content.strip() + # Parse the JSON response from the LLM - content = response.choices[0].message.content.strip() - - # Remove markdown code blocks if present - if content.startswith("```json"): - content = content[7:] - if content.endswith("```"): - content = content[:-3] - content = content.strip() + content = _strip_json_fences(response.choices[0].message.content) # Parse JSON and create StructuredSummary summary_data = json.loads(content) return StructuredSummary(**summary_data)
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (23)
examples/js/cli/package.json(1 hunks)examples/js/langgraph-js-example/package.json(1 hunks)examples/js/vercel-ai-example/package.json(1 hunks)examples/tshirt_store_agent/tshirt_store_agent_executor.py(4 hunks)packages/tui/internal/theme/themes/vesper.json(1 hunks)rogue/common/generic_agent_executor.py(4 hunks)rogue/common/remote_agent_connection.py(2 hunks)rogue/server/api/llm.py(3 hunks)rogue/server/models/api_format.py(1 hunks)rogue/server/services/__init__.py(1 hunks)rogue/server/services/interviewer_service.py(3 hunks)rogue/server/services/llm_service.py(6 hunks)rogue/server/services/scenario_evaluation_service.py(2 hunks)rogue/tests/models/test_cli_input.py(1 hunks)rogue/tests/models/test_evaluation_result.py(8 hunks)rogue/ui/components/config_screen.py(4 hunks)rogue/ui/components/report_generator.py(2 hunks)rogue/ui/components/scenario_runner.py(1 hunks)sdks/python/rogue_sdk/client.py(3 hunks)sdks/python/rogue_sdk/sdk.py(5 hunks)sdks/python/rogue_sdk/tests/test_types.py(2 hunks)sdks/python/rogue_sdk/types.py(5 hunks)sdks/python/rogue_sdk/websocket.py(2 hunks)
✅ Files skipped from review due to trivial changes (11)
- rogue/tests/models/test_cli_input.py
- examples/js/langgraph-js-example/package.json
- examples/js/vercel-ai-example/package.json
- packages/tui/internal/theme/themes/vesper.json
- sdks/python/rogue_sdk/tests/test_types.py
- rogue/ui/components/config_screen.py
- examples/js/cli/package.json
- rogue/server/services/interviewer_service.py
- rogue/server/services/scenario_evaluation_service.py
- examples/tshirt_store_agent/tshirt_store_agent_executor.py
- rogue/common/remote_agent_connection.py
🚧 Files skipped from review as they are similar to previous changes (1)
- rogue/ui/components/report_generator.py
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/services/__init__.pyrogue/server/models/api_format.pyrogue/ui/components/scenario_runner.pyrogue/server/api/llm.pyrogue/tests/models/test_evaluation_result.pyrogue/server/services/llm_service.pyrogue/common/generic_agent_executor.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/services/__init__.pyrogue/server/models/api_format.pyrogue/ui/components/scenario_runner.pyrogue/server/api/llm.pyrogue/tests/models/test_evaluation_result.pyrogue/server/services/llm_service.pyrogue/common/generic_agent_executor.py
🔇 Additional comments (6)
rogue/ui/components/scenario_runner.py (1)
486-491: Sanity-check RogueClientConfig.base_url handling. The SDK's field validator may attempt to construct
`HttpUrl(v)` from a `str`, which can be invalid in Pydantic v2. Verify it accepts a plain string safely. Run:
#!/bin/bash rg -n -C3 'class RogueClientConfig' sdks/python/rogue_sdk/types.py rg -n -C3 'def validate_base_url' sdks/python/rogue_sdk/types.pyrogue/common/generic_agent_executor.py (1)
151-156: LGTM — formatting-only changes. No semantic impact detected.
Also applies to: 159-163, 186-190, 195-201
sdks/python/rogue_sdk/types.py (1)
448-449: Confirm all callers updated for structured summary.
SummaryGenerationResponse.summarynow usesStructuredSummary. Ensure UI/report code doesn’t assume a plain str.sdks/python/rogue_sdk/client.py (1)
167-186: Structured-summary handling looks correct and backward-compatible.Graceful dict-check + legacy fallback is solid. No action needed.
rogue/server/api/llm.py (1)
61-65: API switch to ServerSummaryGenerationResponse looks good.Return type aligns with server-side StructuredSummary.
rogue/server/services/llm_service.py (1)
264-283: Good fallbacks on parse/system errors.Deterministic StructuredSummary on failure keeps API stable.
There was a problem hiding this comment.
Actionable comments posted: 1
🧹 Nitpick comments (4)
rogue/server/models/__init__.py (1)
3-15: Consider re-exporting StructuredSummary for a single, stable import path. Several callers (e.g., services) need StructuredSummary; exposing it here avoids importing from a submodule directly and aligns with “centralize public API surface”.
from .api_format import ( ApiChatMessage, ApiConversationEvaluation, ApiEvaluationResult, ApiScenarioResult, + StructuredSummary, ) __all__ = [ "ApiChatMessage", "ApiConversationEvaluation", "ApiEvaluationResult", "ApiScenarioResult", + "StructuredSummary", ]rogue/server/services/api_format_service.py (3)
84-86: Avoid building a temporary list when counting flagged conversations. Use a generator with sum for lower memory and slightly cleaner intent.
- flaggedConversations=len( - [c for c in api_conversations if not c.passed], - ), + flaggedConversations=sum(1 for c in api_conversations if not c.passed),
12-18: Optional: Import via the package’s public surface once StructuredSummary is re-exported. This keeps imports consistent and respects the “single import path” intent.
-from ..models.api_format import ( +from ..models import ( ApiChatMessage, ApiConversationEvaluation, ApiEvaluationResult, ApiScenarioResult, StructuredSummary, )Please confirm that rogue/server/models/init.py re-exports StructuredSummary before applying.
91-104: Minor: Preserve list structure for key findings and recommendations if the API model supports list[str]. If ApiEvaluationResult.keyFindings/recommendation are lists, emitting newline-joined strings forces the UI to split. If the model expects strings, ignore this.
If lists are supported:
- key_findings = "\n".join( - f"• {finding}" for finding in structured_summary.key_findings - ) - recommendation = "\n".join( - f"• {rec}" for rec in structured_summary.recommendations - ) + key_findings = list(structured_summary.key_findings) + recommendation = list(structured_summary.recommendations)
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (2)
rogue/server/models/__init__.py(1 hunks)rogue/server/services/api_format_service.py(1 hunks)
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/services/api_format_service.pyrogue/server/models/__init__.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/services/api_format_service.pyrogue/server/models/__init__.py
🔇 Additional comments (1)
rogue/server/models/__init__.py (1)
1-2: Good module surface + docstring. Clean re-export hub; no runtime logic.
There was a problem hiding this comment.
Actionable comments posted: 3
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (2)
sdks/python/rogue_sdk/sdk.py (1)
287-335: Fix the return type and robustly coerce the summary into StructuredSummary. The second return value can be None or a non-model (e.g., dict/str). The current signature `Tuple[str, StructuredSummary]` is unsound and will fail mypy/consumers. Also, prefer `model_validate`/`isinstance` over `hasattr`.
Apply:
@@ - ) -> Tuple[str, StructuredSummary]: + ) -> Tuple[str, Optional[StructuredSummary]]: @@ - # Convert structured summary back to string format for backward compatibility - structured_summary = response_data.summary - if hasattr(structured_summary, "overall_summary"): + # Try to coerce the server response into StructuredSummary + raw_summary = response_data.summary + structured_summary: Optional[StructuredSummary] + try: + structured_summary = ( + raw_summary + if isinstance(raw_summary, StructuredSummary) + else StructuredSummary.model_validate(raw_summary) + if raw_summary is not None + else None + ) + except Exception: + structured_summary = None + + if structured_summary is not None: @@ - return "\n".join(summary_parts), structured_summary - else: - # Fallback for string response - return str(structured_summary), structured_summary + return "\n".join(summary_parts), structured_summary + else: + # Fallback for non-structured/None response + return ("" if raw_summary is None else str(raw_summary)), NoneRun to find other call sites that still expect a single return (and update them to destructure):
#!/bin/bash # Find generate_summary usages and show surrounding lines rg -nC2 --type=py '\bgenerate_summary\s*\(' # Heuristic: calls not destructuring assignment on same line rg -nP --type=py '(^|\s)(await\s+)?(\w+\s*=\s*)?sdk\.generate_summary\s*\('rogue/run_cli.py (1)
206-210: Avoid flake8 F841 (unused structured_summary). You’re not using the second return value; assign it to an underscore.
Apply:
- summary, structured_summary = await sdk.generate_summary( + summary, _ = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, )
🧹 Nitpick comments (1)
rogue/run_cli.py (1)
214-216: Specify the encoding when writing the report. Match the UTF-8 encoding you use for the results to avoid platform-default discrepancies.
Apply:
- output_report_file.write_text(summary) + output_report_file.write_text(summary, encoding="utf-8")
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (4)
rogue/run_cli.py(1 hunks)rogue/ui/components/report_generator.py(2 hunks)rogue/ui/components/scenario_runner.py(1 hunks)sdks/python/rogue_sdk/sdk.py(7 hunks)
🚧 Files skipped from review as they are similar to previous changes (1)
- rogue/ui/components/scenario_runner.py
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/run_cli.pyrogue/ui/components/report_generator.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/run_cli.pyrogue/ui/components/report_generator.py
🔇 Additional comments (5)
sdks/python/rogue_sdk/sdk.py (4)
118-120: LGTM: clearer error message formattingTrailing comma is fine under Black; message text reads well.
196-198: LGTM: clearer exception pathGood use of trailing comma and specific message.
214-215: LGTM: explicit WebSocket error copy-throughMessage conveys source error succinctly.
231-233: LGTM: TimeoutError messageClear and actionable timeout text.
rogue/ui/components/report_generator.py (1)
8-9: Fix relative import depth: use two dots, not three.
`from ...server` attempts to import beyond the top-level package from `rogue/ui/components/`. This will raise "attempted relative import beyond top-level package" at runtime. Apply:
-from ...server.services.api_format_service import convert_with_structured_summary
+from ..server.services.api_format_service import convert_with_structured_summary

Likely an incorrect or invalid review comment.
There was a problem hiding this comment.
Actionable comments posted: 2
🧹 Nitpick comments (6)
packages/tui/internal/tui/evaluation.go (3)
95-107: Name the nested types for reuse and clarity. Defining anonymous structs makes reuse/tests harder. Introduce named items for the summary block and breakdown entries.
Apply:
-type SummaryResp struct { - Summary struct { - OverallSummary string `json:"overall_summary"` - KeyFindings []string `json:"key_findings"` - Recommendations []string `json:"recommendations"` - DetailedBreakdown []struct { - Scenario string `json:"scenario"` - Status string `json:"status"` - Outcome string `json:"outcome"` - } `json:"detailed_breakdown"` - } `json:"summary"` - Message string `json:"message"` -} +type DetailedBreakdownItem struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` +} + +type SummaryBlock struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + Recommendations []string `json:"recommendations"` + DetailedBreakdown []DetailedBreakdownItem `json:"detailed_breakdown"` +} + +type SummaryResp struct { + Summary SummaryBlock `json:"summary"` + Message string `json:"message"` +}
462-474: Guard empty results and omit api_key when empty. Avoid sending an empty results envelope and exclude api_key when not provided.
Apply:
- if job.Results == nil { - return nil, fmt.Errorf("no results available for job %s", jobID) - } + if len(job.Results) == 0 { + return nil, fmt.Errorf("no results available for job %s", jobID) + } @@ - summaryReq := map[string]interface{}{ - "model": model, - "api_key": apiKey, - "results": map[string]interface{}{ - "results": job.Results, - }, - } + summaryReq := map[string]interface{}{ + "model": model, + "results": map[string]interface{}{ + "results": job.Results, + }, + } + if apiKey != "" { + summaryReq["api_key"] = apiKey + }
484-485: Set Accept header for explicit JSON negotiation. Small hardening for content-type expectations.
Apply:
  req.Header.Set("Content-Type", "application/json")
+ req.Header.Set("Accept", "application/json")

packages/tui/internal/tui/report_view.go (1)
25-25: Main container height likely truncates content. Using m.height-12 can clip background/layout. Prefer full height unless there’s a deliberate reserved region.
Apply:
- Height(m.height - 12).
+ Height(m.height).

packages/tui/internal/tui/app.go (2)
160-174: New detailedBreakdown state is unused. The Model field detailedBreakdown []table.Row isn’t populated or rendered yet. Either wire it into ReportScreen (e.g., maintain a table.Model with headers/rows) or defer adding the field until the table view lands to avoid dead state.
Would you like a follow-up patch to populate rows from structuredSummary.Summary.DetailedBreakdown and render a table in renderReport?
269-269: Report viewport height reduced to 15 lines. This tightens visible report space; verify it plays well with typical summary lengths and doesn’t regress usability on smaller terminals. Consider making this responsive to terminal height (min/max) if needed.
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
⛔ Files ignored due to path filters (1)
packages/tui/go.sumis excluded by!**/*.sum
📒 Files selected for processing (4)
packages/tui/go.mod(1 hunks)packages/tui/internal/tui/app.go(4 hunks)packages/tui/internal/tui/evaluation.go(4 hunks)packages/tui/internal/tui/report_view.go(3 hunks)
🧰 Additional context used
🧬 Code graph analysis (3)
packages/tui/internal/tui/report_view.go (1)
packages/tui/internal/styles/styles.go (3)
NewStyle(45-47)Border(23-23)Style(40-42)
packages/tui/internal/tui/evaluation.go (2)
packages/tui/internal/tui/app.go (1)
Scenario(203-207)packages/sdk/src/types.ts (1)
Scenario(41-47)
packages/tui/internal/tui/app.go (5)
packages/sdk/src/types.ts (1)
Scenario(41-47)packages/tui/internal/components/command_line.go (1)
CommandInput(21-30)packages/tui/internal/components/dialog.go (1)
Dialog(38-51)packages/tui/internal/components/scenario_editor.go (1)
ScenarioEditor(33-71)packages/tui/internal/components/viewport.go (1)
NewViewport(74-93)
🔇 Additional comments (5)
packages/tui/go.mod (1)
26-26: Confirm v2 imports and table API compatibility
- All
`bubbles`, `bubbletea`, and `lipgloss` imports reference `/v2` (no lingering v1 paths). Please manually verify that the `github.com/charmbracelet/bubbles/v2/table` usage in `packages/tui/internal/tui/app.go` matches the v2.0.0-beta.1 API.

packages/tui/internal/tui/evaluation.go (1)
455-509: Signature change ripple check complete. All call sites of
`GenerateSummary` now correctly handle the new `*SummaryResp` return:

- app.go: uses `structuredSummary.Summary.OverallSummary` (line 92) before assigning to `m.evalState.Summary`
- Error path in `app.go` still assigns a string to `m.evalState.Summary`
- `report_view.go` and `eval_view.go` continue to treat `m.evalState.Summary` as a string for rendering

No remaining changes required.
packages/tui/internal/tui/report_view.go (2)
105-112: Request viewport and container setup: provide the code where `viewportHeight` is computed and `contentArea` is initialized (the lines immediately before 105) so we can confirm the height passed to `lipgloss.Place` matches the container.
74-75: No changes required for BorderBackground usage. Style.BorderBackground(...) is available in lipgloss/v2 v2.0.0-beta.3, so the existing call is valid.
packages/tui/internal/tui/app.go (1)
11-13: I've added steps to locate the module file and verify that only `/v2` imports are used across the codebase.
There was a problem hiding this comment.
Actionable comments posted: 10
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (4)
packages/tui/internal/tui/evaluation.go (2)
423-428: Strengthen URL validation (ParseRequestURI + scheme/host check). url.Parse accepts many invalid inputs; enforce scheme/host.
- // Validate URLs - if _, err := url.Parse(serverURL); err != nil { - return nil, nil, fmt.Errorf("invalid server url: %w", err) - } - if _, err := url.Parse(agentURL); err != nil { - return nil, nil, fmt.Errorf("invalid agent url: %w", err) - } + // Validate URLs strictly (require scheme + host) + if u, err := url.ParseRequestURI(serverURL); err != nil || u.Scheme == "" || u.Host == "" { + return nil, nil, fmt.Errorf("invalid server url: %q", serverURL) + } + if u, err := url.ParseRequestURI(agentURL); err != nil || u.Scheme == "" || u.Host == "" { + return nil, nil, fmt.Errorf("invalid agent url: %q", agentURL) + }
468-479: Nil deref on qualifireAPIKey; hardcoded Qualifire URL. Dereferencing a nil *string will panic. Also defaulting to localhost is risky in prod builds.
- summaryReq := map[string]interface{}{ - "model": model, - "api_key": apiKey, - "results": map[string]interface{}{ - "results": job.Results, - }, - "job_id": jobID, - "qualifire_api_key": *qualifireAPIKey, - "qualifire_url": "http://localhost:3000", - } + summaryReq := map[string]interface{}{ + "model": model, + "api_key": apiKey, + "results": map[string]interface{}{"results": job.Results}, + "job_id": jobID, + // Prefer production default; make configurable if needed. + "qualifire_url": "https://app.qualifire.ai", + } + if qualifireAPIKey != nil && *qualifireAPIKey != "" { + summaryReq["qualifire_api_key"] = *qualifireAPIKey + }Optional: also treat empty results as no-op instead of error:
- if job.Results == nil { + if job.Results == nil || (len(job.Results) == 0) { return nil, fmt.Errorf("no results available for job %s", jobID) }Also applies to: 507-514
sdks/python/rogue_sdk/types.py (1)
431-442: Unify Qualifire URL default across SDKs
Detected two hardcoded defaults in Python (sdks/python/rogue_sdk/types.py lines 441 & 510) set to `"https://app.qualifire.ai"` and a mismatched Go default (`"http://localhost:3000"` in packages/tui/internal/tui/evaluation.go line 477). Extract `qualifire_url` into a shared config or environment variable to prevent drift.
361-369: Tool signature mismatch breaks FunctionTool invocation.The LLM instructions call _log_evaluation with 4 params, but the function requires a 5th (scenario_type). This will raise at runtime. Make scenario_type optional with a default or remove it.
- def _log_evaluation( - self, - scenario: dict[str, str], - context_id: str, - evaluation_passed: bool, - reason: str, - scenario_type: Optional[str], - ) -> None: + def _log_evaluation( + self, + scenario: dict[str, str], + context_id: str, + evaluation_passed: bool, + reason: str, + scenario_type: Optional[str] = None, + ) -> None: @@ "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, - "scenario_type": scenario_type, + "scenario_type": scenario_type, }, )Option B (safer): drop the extra param entirely and read type from the parsed scenario.
Also applies to: 384-399
♻️ Duplicate comments (4)
packages/tui/internal/tui/report_view.go (1)
61-67: Normalize viewport sizing; use one source of truth (and add guards).Inconsistent subtractions (-2, -8) cause clipping/off-by-one scroll issues and misaligned centering. Compute width/height once, guard for small terminals, and reuse those exact values for SetSize and Style.Height.
- viewportWidth := m.width - 8 // Leave margins - viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) + viewportWidth := m.width - 8 // leave margins + viewportHeight := m.height - 4 // title(3) + help(1) + margins(4) + if viewportWidth < 20 { + viewportWidth = 20 + } + if viewportHeight < 5 { + viewportHeight = 5 + } @@ - viewport.SetSize(viewportWidth, viewportHeight-2) + viewport.SetSize(viewportWidth, viewportHeight) @@ - Height(viewportHeight - 8). + Height(viewportHeight). @@ - Width(viewportWidth). - Height(viewportHeight-8). + Width(viewportWidth). + Height(viewportHeight). @@ - Height(viewportHeight - 8). + Height(viewportHeight). @@ - viewportHeight-8, + viewportHeight,Also applies to: 71-76, 81-83, 106-113
packages/tui/internal/tui/app.go (1)
90-114: Fix panic risk (err not checked) and avoid O(n²) string concat; only emit non-empty sections.Dereferencing structuredSummary before checking err can panic on failures; repeated + concatenation is inefficient.
- structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) - - overallSummary := structuredSummary.Summary.OverallSummary - keyFindings := structuredSummary.Summary.KeyFindings - parsedKeyFindings := "" - for _, finding := range keyFindings { - parsedKeyFindings += "- " + finding + "\n" - } - recommendations := structuredSummary.Summary.Recommendations - parsedRecommendations := "" - for _, recommendation := range recommendations { - parsedRecommendations += "- " + recommendation + "\n" - } - - detailedBreakdown := structuredSummary.Summary.DetailedBreakdown - parsedDetailedBreakdown := "" - for _, breakdown := range detailedBreakdown { - parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" - } - - summary := "## Overall Summary\n\n" + overallSummary + - "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + - "\n\n" + "## Recommendations\n\n" + parsedRecommendations + - "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) + if err != nil { + return SummaryGeneratedMsg{Summary: "", Err: err} + } + + sum := structuredSummary.Summary + var b strings.Builder + if sum.OverallSummary != "" { + b.WriteString("## Overall Summary\n\n") + b.WriteString(sum.OverallSummary) + b.WriteString("\n\n") + } + if len(sum.KeyFindings) > 0 { + b.WriteString("## Key Findings\n\n") + for _, f := range sum.KeyFindings { + if f == "" { + continue + } + b.WriteString("- ") + b.WriteString(f) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.Recommendations) > 0 { + b.WriteString("## Recommendations\n\n") + for _, r := range sum.Recommendations { + if r == "" { + continue + } + b.WriteString("- ") + b.WriteString(r) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.DetailedBreakdown) > 0 { + b.WriteString("## Detailed 
Breakdown\n\n") + for _, br := range sum.DetailedBreakdown { + b.WriteString("- ") + b.WriteString(br.Scenario) + b.WriteString(" - ") + b.WriteString(br.Status) + b.WriteString(" - ") + b.WriteString(br.Outcome) + b.WriteByte('\n') + } + } + summary := strings.TrimSpace(b.String())sdks/python/rogue_sdk/types.py (1)
264-275: Log timestamp parse failures instead of silent fallback. Retain current fallback but log once per failure for observability.
- except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) + except (ValueError, AttributeError) as exc: + logger.warning( + f"Failed to parse timestamp: {msg.timestamp!r}; using now()", + exc_info=exc, + ) + timestamp = datetime.now(timezone.utc)rogue/server/services/api_format_service.py (1)
51-61: Normalize message timestamps to timezone-aware UTC (repeat of prior review). A naive parsed datetime leaks through when msg.timestamp lacks tzinfo. Always coerce to aware UTC.
Apply this diff:
- timestamp = datetime.now(timezone.utc) - if msg.timestamp: - try: - if isinstance(msg.timestamp, str): - timestamp = datetime.fromisoformat( - msg.timestamp.replace("Z", "+00:00"), - ) - else: - timestamp = msg.timestamp - except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + parsed = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + timestamp = ( + parsed.replace(tzinfo=timezone.utc) + if parsed.tzinfo is None + else parsed.astimezone(timezone.utc) + ) + elif isinstance(msg.timestamp, datetime): + timestamp = ( + msg.timestamp.replace(tzinfo=timezone.utc) + if msg.timestamp.tzinfo is None + else msg.timestamp.astimezone(timezone.utc) + ) + else: + timestamp = datetime.now(timezone.utc) + except (ValueError, AttributeError, TypeError): + timestamp = datetime.now(timezone.utc)
🧹 Nitpick comments (21)
packages/tui/internal/tui/report_view.go (1)
25-25: Main container should fill full height. Using Height(m.height - 12) can leave a background gap. Prefer full height.
- Height(m.height - 12). + Height(m.height).packages/tui/internal/tui/app.go (1)
115-118: Propagate nil error when summary builds successfully. After the refactor above, return Err: nil to avoid implying partial failure.
- return SummaryGeneratedMsg{ - Summary: summary, - Err: err, - } + return SummaryGeneratedMsg{Summary: summary, Err: nil}packages/tui/internal/tui/eval_ui.go (2)
38-49: Harden parsing: default scenario_type when missing; minor robustness. Provide a sane default when scenario_type is empty to match SDK defaults (e.g., "policy").
- for _, s := range v.Scenarios { + for _, s := range v.Scenarios { + st := s.ScenarioType + if st == "" { + st = "policy" + } if s.Scenario != "" { out = append(out, EvalScenario{ Scenario: s.Scenario, - ScenarioType: ScenarioType(s.ScenarioType), + ScenarioType: ScenarioType(st), ExpectedOutcome: s.ExpectedOutcome, }) } }Also applies to: 52-52, 56-59
39-41: Handle Getwd error and return empty slice instead of nil. Avoids surprises for callers and early-exits cleanly.
- wd, _ := os.Getwd() + wd, err := os.Getwd() + if err != nil { + return []EvalScenario{} + } @@ - return nil + return []EvalScenario{}Also applies to: 71-71
rogue/server/services/__init__.py (1)
2-9: Alphabetize imports to satisfy isort/flake8. Keep submodule imports sorted lexicographically.
from . import ( - api_format_service, - evaluation_library, - evaluation_service, - interviewer_service, - llm_service, - scenario_evaluation_service, - qualifire_service, + api_format_service, + evaluation_library, + evaluation_service, + interviewer_service, + llm_service, + qualifire_service, + scenario_evaluation_service, )rogue/evaluator_agent/evaluator_agent.py (1)
120-126: Keep tool docstring in sync with callable signature. If you keep scenario_type as a parameter, reflect it in the instructions; otherwise, remove it from the signature and leave the docstring as-is.
-3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)` +3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str, scenario_type: Optional[str] = None)`rogue/server/services/qualifire_service.py (1)
10-13: Add explicit return type for report_summary. Annotate return for mypy and readability.
- def report_summary( + def report_summary( request: ReportSummaryRequest, evaluation_result: EvaluationResult, - ): + ) -> dict:rogue/run_cli.py (3)
191-194: Plumb Qualifire key through CLI/config. You added qualifire_api_key_secret to create_report but it isn’t sourced from CLI/config here. Ensure CLIInput/args support and forward it.
220-222: Specify encoding when writing the report. Avoid locale-dependent defaults.
- output_report_file.write_text(summary) + output_report_file.write_text(summary, encoding="utf-8")
361-363: Forward Qualifire key to create_report if available. Pipe the secret from CLIInput/config to create_report.
rogue/server/models/api_format.py (2)
10-12: Import Field/ConfigDict for aliases and stricter validation. To support aliasing (camelCase in JSON) and extra="forbid" configs in models below, extend imports.
-from pydantic import BaseModel +from pydantic import BaseModel, Field, ConfigDict
14-20: Forbid unknown fields and require timezone-aware timestamps
- Add extra="forbid" to block unexpected data.
- Enforce/verify timestamps are timezone-aware (UTC) to avoid ambiguity.
class ApiChatMessage(BaseModel): """Chat message for new API format with datetime timestamp.""" + model_config = ConfigDict(extra="forbid") role: str content: str timestamp: datetimeWould you like me to add a validator that rejects naive datetimes, or is upstream guaranteed to send RFC 3339 with Z?
rogue/server/services/llm_service.py (4)
254-259: Harden code-fence stripping (handles case/whitespace/newlines). Use a single regex to remove leading/trailing fences in more edge cases.
- # Remove markdown code blocks if present - if content.startswith("```json"): - content = content[7:] - if content.endswith("```"): - content = content[:-3] - content = content.strip() + # Remove leading/trailing ```/```json fences robustly + content = re.sub( + r'^\s*```(?:json)?\s*|\s*```\s*$', + '', + content.strip(), + flags=re.IGNORECASE, + ).strip()(Remember to import re at the top.)
-import json +import json +import re
261-263: Prefer model_validate for clarity. Minor readability: use Pydantic’s validator for dict input.
- summary_data = json.loads(content) - return StructuredSummary(**summary_data) + summary_data = json.loads(content) + return StructuredSummary.model_validate(summary_data)
156-163: Externalize STATIC_SCENARIOS to config. Hardcoding dataset ids/sample sizes makes deployments rigid. Consider moving to a config/env flag or settings module.
244-248: Enforce JSON-mode in LiteLLM completions
Add `response_format={'type': 'json_object'}` to the `completion` call (LiteLLM supports this natively) and guard it with `supports_response_schema(model)` or via `litellm.get_supported_openai_params()` to ensure the target provider honors JSON schemas.

File: rogue/server/services/llm_service.py @@ lines 244-248
  response = completion(
      model=model,
      messages=messages,
      api_key=api_key,
+     response_format={'type': 'json_object'},
  )

sdks/python/rogue_sdk/sdk.py (1)
308-343: Improve Markdown formatting for detailed_breakdown. Current f-string prints raw dicts. Format rows for readability and guard non-dict entries.
- if hasattr(structured_summary, "overall_summary"): + if hasattr(structured_summary, "overall_summary"): # Format as markdown string for UI display @@ - if structured_summary.detailed_breakdown: - breakdown = "\n".join( - f"{i + 1}. {row}" - for i, row in enumerate(structured_summary.detailed_breakdown) - ) - summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{breakdown}") + if structured_summary.detailed_breakdown: + def _fmt_row(i, row): + if isinstance(row, dict): + scenario = row.get("scenario", "N/A") + status = row.get("status", "") + outcome = row.get("outcome", "") + return f"{i + 1}. {status} {scenario} — {outcome}" + return f"{i + 1}. {row}" + breakdown = "\n".join( + _fmt_row(i, row) + for i, row in enumerate(structured_summary.detailed_breakdown) + ) + summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{breakdown}")rogue/server/services/api_format_service.py (1)
39-41: Coerce start_time to aware UTC. If a naive datetime is supplied, normalize to UTC to avoid mixed-aware/naive issues.
Apply this diff:
if start_time is None: start_time = datetime.now(timezone.utc) + else: + start_time = ( + start_time.replace(tzinfo=timezone.utc) + if start_time.tzinfo is None + else start_time.astimezone(timezone.utc) + )rogue/server/api/llm.py (3)
126-129: Reduce sensitive/verbose logging of summary/results. Avoid dumping full summary and entire results; log counts/metadata instead.
Apply this diff:
- logger.info( - "Summary", - extra={"summary": summary, "results": request.results}, - ) + logger.info( + "Summary generated", + extra={ + "key_findings_count": len(getattr(summary, "key_findings", []) or []), + "results_count": len(request.results.results), + }, + )
136-141: Ensure start_time is timezone-aware UTC when using job.created_at. job.created_at may be naive; normalize before sending to Qualifire.
Apply this diff:
- start_time=( - job.created_at - if job is not None - else datetime.now(timezone.utc) - ), + start_time=( + ( + job.created_at.replace(tzinfo=timezone.utc) + if job.created_at.tzinfo is None + else job.created_at.astimezone(timezone.utc) + ) + if job is not None and getattr(job, "created_at", None) + else datetime.now(timezone.utc) + ),
90-95: Avoid blocking the event loop with sync LLM calls. LLMService.generate_summary_from_results is sync; offload to a threadpool.
You can replace the call with:
from fastapi.concurrency import run_in_threadpool summary = await run_in_threadpool( LLMService.generate_summary_from_results, model=request.model, results=request.results, llm_provider_api_key=request.api_key, )
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (18)
packages/tui/internal/tui/app.go(4 hunks)packages/tui/internal/tui/eval_ui.go(2 hunks)packages/tui/internal/tui/evaluation.go(6 hunks)packages/tui/internal/tui/report_view.go(3 hunks)rogue/evaluator_agent/evaluator_agent.py(3 hunks)rogue/run_cli.py(3 hunks)rogue/server/api/__init__.py(1 hunks)rogue/server/api/evaluation.py(1 hunks)rogue/server/api/llm.py(4 hunks)rogue/server/models/api_format.py(1 hunks)rogue/server/services/__init__.py(1 hunks)rogue/server/services/api_format_service.py(1 hunks)rogue/server/services/llm_service.py(6 hunks)rogue/server/services/qualifire_service.py(1 hunks)rogue/ui/components/scenario_runner.py(1 hunks)sdks/python/rogue_sdk/client.py(4 hunks)sdks/python/rogue_sdk/sdk.py(7 hunks)sdks/python/rogue_sdk/types.py(8 hunks)
✅ Files skipped from review due to trivial changes (1)
- rogue/server/api/init.py
🚧 Files skipped from review as they are similar to previous changes (1)
- rogue/ui/components/scenario_runner.py
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/services/__init__.pyrogue/server/services/qualifire_service.pyrogue/server/services/api_format_service.pyrogue/server/models/api_format.pyrogue/server/api/evaluation.pyrogue/evaluator_agent/evaluator_agent.pyrogue/server/services/llm_service.pyrogue/run_cli.pyrogue/server/api/llm.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/services/__init__.pyrogue/server/services/qualifire_service.pyrogue/server/services/api_format_service.pyrogue/server/models/api_format.pyrogue/server/api/evaluation.pyrogue/evaluator_agent/evaluator_agent.pyrogue/server/services/llm_service.pyrogue/run_cli.pyrogue/server/api/llm.py
🧬 Code graph analysis (13)
rogue/server/services/qualifire_service.py (2)
rogue/server/services/api_format_service.py (1)
convert_with_structured_summary(116-141)sdks/python/rogue_sdk/client.py (1)
request(79-83)
rogue/server/services/api_format_service.py (1)
rogue/server/models/api_format.py (4)
ApiChatMessage(14-19)ApiConversationEvaluation(22-27)ApiEvaluationResult(40-49)ApiScenarioResult(30-37)
packages/tui/internal/tui/app.go (3)
sdks/python/rogue_sdk/types.py (1)
Scenario(94-133)packages/sdk/src/types.ts (1)
Scenario(41-47)packages/tui/internal/components/viewport.go (1)
NewViewport(74-93)
rogue/server/models/api_format.py (1)
sdks/python/rogue_sdk/types.py (1)
StructuredSummary(444-450)
packages/tui/internal/tui/eval_ui.go (3)
sdks/python/rogue_sdk/types.py (3)
Scenarios(136-154)Scenario(94-133)ScenarioType(45-49)packages/tui/internal/tui/evaluation.go (2)
EvalScenario(44-48)ScenarioType(27-27)packages/sdk/src/types.ts (1)
Scenario(41-47)
rogue/server/services/llm_service.py (1)
sdks/python/rogue_sdk/types.py (5)
EvaluationResults(192-208)Scenario(94-133)Scenarios(136-154)ScenarioType(45-49)StructuredSummary(444-450)
sdks/python/rogue_sdk/client.py (1)
sdks/python/rogue_sdk/types.py (3)
StructuredSummary(444-450)SummaryGenerationResponse(453-457)SummaryGenerationRequest(431-441)
sdks/python/rogue_sdk/sdk.py (3)
sdks/python/rogue_sdk/types.py (1)
StructuredSummary(444-450)rogue/server/api/llm.py (1)
generate_summary(72-158)sdks/python/rogue_sdk/client.py (1)
generate_summary(147-194)
sdks/python/rogue_sdk/types.py (2)
rogue/server/models/api_format.py (4)
ApiChatMessage(14-19)ApiConversationEvaluation(22-27)ApiScenarioResult(30-37)ApiEvaluationResult(40-49)rogue/server/services/api_format_service.py (1)
convert_to_api_format(20-113)
rogue/run_cli.py (3)
rogue/server/api/llm.py (1)
generate_summary(72-158)sdks/python/rogue_sdk/client.py (1)
generate_summary(147-194)sdks/python/rogue_sdk/sdk.py (1)
generate_summary(287-342)
packages/tui/internal/tui/evaluation.go (3)
sdks/python/rogue_sdk/types.py (2)
Scenario(94-133)ScenarioType(45-49)packages/sdk/src/types.ts (1)
Scenario(41-47)sdks/python/rogue_sdk/sdk.py (1)
RogueSDK(34-391)
rogue/server/api/llm.py (9)
sdks/python/rogue_sdk/types.py (3)
SummaryGenerationRequest(431-441)ReportSummaryResponse(513-516)ReportSummaryRequest(501-510)rogue/server/api/evaluation.py (1)
get_evaluation_service(23-24)rogue/server/services/evaluation_service.py (2)
EvaluationService(20-234)get_job(31-33)rogue/server/models/api_format.py (1)
ServerSummaryGenerationResponse(52-56)rogue/common/logging/config.py (1)
get_logger(155-167)rogue/server/services/llm_service.py (1)
LLMService(166-283)rogue/server/services/qualifire_service.py (2)
QualifireService(8-40)report_summary(10-40)sdks/python/rogue_sdk/client.py (2)
generate_summary(147-194)request(79-83)sdks/python/rogue_sdk/sdk.py (1)
generate_summary(287-342)
packages/tui/internal/tui/report_view.go (1)
packages/tui/internal/styles/styles.go (3)
NewStyle(45-47)Border(23-23)Style(40-42)
🔇 Additional comments (23)
packages/tui/internal/tui/app.go (4)
160-174: detailedBreakdown field addition looks good. Type ties the import; no unused-import build issue.
269-269: Report viewport initial height changed to 15.Verify this still fits the new layout; otherwise align with standardized sizing or compute dynamically.
11-13: Import path confirmed
The import pathgithub.com/charmbracelet/bubbles/v2/tableis correct for Bubbles v2 according to pkg.go.dev and the official Charmbracelet repo.
382-387: Incorrect sizing refactor suggestion
The report view in report_view.go uses `viewport.SetSize(viewportWidth, viewportHeight-2)`, not `msg.Width-8`/`msg.Height-4`, and other components apply context-specific offsets (e.g., height-12 for headers, –4 for help/interview). Switching to the proposed diff will break those layouts. Instead, extract and use shared constants for horizontal and vertical margins if you need consistent sizing.

Likely an incorrect or invalid review comment.
packages/tui/internal/tui/eval_ui.go (1)
17-17: Shift to structured scenarios is solid. Aligns UI with API/SDK models.
packages/tui/internal/tui/evaluation.go (4)
45-47: Adds expected_outcome to scenarios — OK. JSON tag + omitempty are correct; aligns with SDK/server contracts.
96-108: Structured summary response type — OK. Fields map cleanly to the server’s StructuredSummary shape.
447-450: Scenario conversion — OK. Scenario, ScenarioType, ExpectedOutcome are forwarded correctly.
419-421: Signature change verified. The TUI caller in eval_ui.go has been updated to pass `[]EvalScenario`; no further updates required.

sdks/python/rogue_sdk/types.py (4)
80-81: AgentConfig.qualifire_api_key — OK. Matches new flows; optional and safe default.
389-391: EvaluationJob.deep_test/judge_model — OK. Captures flags required downstream in reports.
444-457: StructuredSummary model — OK. Field names and shapes match the response used by clients.
501-517: Report summary models — OK. Surface looks minimal and sufficient for BFF reporting.
rogue/server/api/evaluation.py (1)
62-64: Persist deep_test and judge_model on job — OK. Enables downstream formatting/reporting without re-deriving flags.
sdks/python/rogue_sdk/client.py (3)
147-156: API surface extension looks good; confirm server supports new fields. Params (qualifire_api_key, job_id, deep_test, judge_model) align with SummaryGenerationRequest. OK.
175-194: Structured response handling LGTM. Good fallback to legacy string; preserves backward compatibility.
269-270: Minor style change is fine. Trailing comma in exception message is harmless and Black-compatible.
rogue/server/models/api_format.py (2)
52-56: LGTM: ServerSummaryGenerationResponse aligns with SDK types. Returning a StructuredSummary here matches the client expectations and reduces ad-hoc parsing downstream.
43-47: No changes needed: `keyFindings` is intentionally a string for the API contract.
The `api_format_service.py` joins `StructuredSummary.key_findings` into a single string and the existing tests assert this behavior, so retaining `Optional[str]` aligns with the legacy BFF shape.
173-185: Docstring/API contract improvements look good. Clearer args/returns/raises improve maintainability. No action needed.
sdks/python/rogue_sdk/sdk.py (2)
118-119: LGTM: clearer error message with no semantic change. Minor formatting-only tweak reads better and stays lint-clean.
292-297: Call sites updated for new Tuple return signature. Callers in rogue/run_cli.py and rogue/ui/components/scenario_runner.py already unpack both values; no further action required.

rogue/server/api/llm.py (1)
178-189: Report only the first evaluation result. `results[0]` ignores all other results in the job. Confirm the signature of `QualifireService.report_summary` — if it should handle multiple results, either loop over `job.results` and call `report_summary` for each, or pass the full `EvaluationResults` container instead of just the first element.
| logger.info( | ||
| "Qualifire API key", | ||
| extra={"qualifire_api_key": request.qualifire_api_key}, | ||
| ) |
There was a problem hiding this comment.
Do not log secrets (qualifire_api_key).
This logs a raw API key. Remove or only log presence.
Apply this diff:
- logger.info(
- "Qualifire API key",
- extra={"qualifire_api_key": request.qualifire_api_key},
- )
+ logger.info(
+ "Qualifire API key provided",
+ extra={"present": bool(request.qualifire_api_key)},
+ )📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| logger.info( | |
| "Qualifire API key", | |
| extra={"qualifire_api_key": request.qualifire_api_key}, | |
| ) | |
| logger.info( | |
| "Qualifire API key provided", | |
| extra={"present": bool(request.qualifire_api_key)}, | |
| ) |
🤖 Prompt for AI Agents
In rogue/server/api/llm.py around lines 98 to 101, the code logs the raw
qualifire_api_key which is a secret; remove the secret from logs and instead log
only its presence or a boolean/masked value. Replace the logger.info call so it
does not include request.qualifire_api_key (e.g., log "qualifire_api_key
present": True or "qualifire_api_key": "*****"), or remove the log entirely if
not needed, ensuring no secret value is ever written to logs.
| class ApiScenarioResult(BaseModel): | ||
| """Result of evaluating a single scenario in new API format.""" | ||
|
|
||
| description: Optional[str] = None | ||
| expectedOutcome: Optional[str] = None | ||
| totalConversations: Optional[int] = None | ||
| flaggedConversations: Optional[int] = None | ||
| conversations: List[ApiConversationEvaluation] | ||
|
|
There was a problem hiding this comment.
🛠️ Refactor suggestion
Use snake_case fields with serialization aliases to meet PEP 8 while emitting camelCase JSON
Internal snake_case keeps mypy/flake8 happy and avoids mixed styles. Emit camelCase via serialization_alias so the wire format is unchanged.
class ApiScenarioResult(BaseModel):
"""Result of evaluating a single scenario in new API format."""
- description: Optional[str] = None
- expectedOutcome: Optional[str] = None
- totalConversations: Optional[int] = None
- flaggedConversations: Optional[int] = None
- conversations: List[ApiConversationEvaluation]
+ model_config = ConfigDict(extra="forbid")
+ description: Optional[str] = None
+ expected_outcome: Optional[str] = Field(None, serialization_alias="expectedOutcome")
+ total_conversations: Optional[int] = Field(None, serialization_alias="totalConversations")
+ flagged_conversations: Optional[int] = Field(None, serialization_alias="flaggedConversations")
+ conversations: List[ApiConversationEvaluation] = Field(default_factory=list)
class ApiEvaluationResult(BaseModel):
"""New API format for evaluation results."""
- scenarios: List[ApiScenarioResult]
- summary: Optional[str] = None
- keyFindings: Optional[str] = None
- recommendation: Optional[str] = None
- deepTest: bool = False
- startTime: datetime
- judgeModel: Optional[str] = None
+ model_config = ConfigDict(extra="forbid")
+ scenarios: List[ApiScenarioResult]
+ summary: Optional[str] = None
+ key_findings: Optional[str] = Field(None, serialization_alias="keyFindings")
+ recommendation: Optional[str] = None
+ deep_test: bool = Field(False, serialization_alias="deepTest")
+ start_time: datetime = Field(..., serialization_alias="startTime")
+ judge_model: Optional[str] = Field(None, serialization_alias="judgeModel")Note: This preserves the API surface (camelCase) while making codebase PEP 8 compliant. You may need to update construction sites to use snake_case names (aliases still serialize to camelCase). I can bulk-update call sites if helpful.
Also applies to: 40-49
| api_scenarios.append( | ||
| ApiScenarioResult( | ||
| description=result.scenario.scenario, | ||
| expectedOutcome=result.scenario.expected_outcome, | ||
| totalConversations=len(api_conversations), | ||
| flaggedConversations=len( | ||
| [c for c in api_conversations if not c.passed], | ||
| ), | ||
| conversations=api_conversations, | ||
| ), |
There was a problem hiding this comment.
Don’t pass unknown field expectedOutcome to ApiScenarioResult (guard or remove).
ApiScenarioResult (per models/api_format.py) doesn’t declare expectedOutcome. Passing it will either raise a validation error (if extras=forbid) or be dropped silently. Gate this add at runtime to avoid breakage.
Apply this diff to make it safe:
- api_scenarios.append(
- ApiScenarioResult(
- description=result.scenario.scenario,
- expectedOutcome=result.scenario.expected_outcome,
- totalConversations=len(api_conversations),
- flaggedConversations=len(
- [c for c in api_conversations if not c.passed],
- ),
- conversations=api_conversations,
- ),
- )
+ scenario_kwargs = dict(
+ description=result.scenario.scenario,
+ totalConversations=len(api_conversations),
+ flaggedConversations=len([c for c in api_conversations if not c.passed]),
+ conversations=api_conversations,
+ )
+ expected = getattr(result.scenario, "expected_outcome", None)
+ if (
+ expected is not None
+ and hasattr(ApiScenarioResult, "model_fields")
+ and "expectedOutcome" in ApiScenarioResult.model_fields
+ ):
+ scenario_kwargs["expectedOutcome"] = expected
+
+ api_scenarios.append(ApiScenarioResult(**scenario_kwargs))📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| api_scenarios.append( | |
| ApiScenarioResult( | |
| description=result.scenario.scenario, | |
| expectedOutcome=result.scenario.expected_outcome, | |
| totalConversations=len(api_conversations), | |
| flaggedConversations=len( | |
| [c for c in api_conversations if not c.passed], | |
| ), | |
| conversations=api_conversations, | |
| ), | |
| # Build kwargs for ApiScenarioResult, omitting unknown fields | |
| scenario_kwargs = dict( | |
| description=result.scenario.scenario, | |
| totalConversations=len(api_conversations), | |
| flaggedConversations=len([c for c in api_conversations if not c.passed]), | |
| conversations=api_conversations, | |
| ) | |
| # Only include expectedOutcome if the model actually declares it | |
| expected = getattr(result.scenario, "expected_outcome", None) | |
| if ( | |
| expected is not None | |
| and hasattr(ApiScenarioResult, "model_fields") | |
| and "expectedOutcome" in ApiScenarioResult.model_fields | |
| ): | |
| scenario_kwargs["expectedOutcome"] = expected | |
| api_scenarios.append(ApiScenarioResult(**scenario_kwargs)) |
🤖 Prompt for AI Agents
In rogue/server/services/api_format_service.py around lines 79-88, the code is
passing an unknown field expectedOutcome to ApiScenarioResult which will be
rejected or dropped; fix this by removing the expectedOutcome argument (or
alternatively only include it when the ApiScenarioResult dataclass/model
actually defines that field). Concretely, stop passing
expectedOutcome=result.scenario.expected_outcome into the ApiScenarioResult
constructor, or build the kwargs dict dynamically and add expectedOutcome only
if hasattr(ApiScenarioResult, "expectedOutcome") or the model's field list
contains it, then pass that dict to ApiScenarioResult.
| api_evaluation_result = convert_with_structured_summary( | ||
| evaluation_results=evaluation_result, | ||
| structured_summary=request.structured_summary, | ||
| deep_test=request.deep_test, | ||
| start_time=request.start_time, | ||
| judge_model=request.judge_model, | ||
| ) |
There was a problem hiding this comment.
Wrong type passed to converter (EvaluationResult vs EvaluationResults).
convert_with_structured_summary expects EvaluationResults, but a single EvaluationResult is passed. This will mis-shape the payload.
-from rogue_sdk.types import EvaluationResult, ReportSummaryRequest
+from rogue_sdk.types import EvaluationResult, EvaluationResults, ReportSummaryRequest
@@
- api_evaluation_result = convert_with_structured_summary(
- evaluation_results=evaluation_result,
+ api_evaluation_result = convert_with_structured_summary(
+ evaluation_results=EvaluationResults(results=[evaluation_result]),
structured_summary=request.structured_summary,
deep_test=request.deep_test,
start_time=request.start_time,
judge_model=request.judge_model,
)🤖 Prompt for AI Agents
In rogue/server/services/qualifire_service.py around lines 18 to 24, the call
passes a single EvaluationResult (evaluation_result) to
convert_with_structured_summary which expects an EvaluationResults collection;
replace the single-item argument with an EvaluationResults instance (e.g. use
the existing evaluation_results collection variable or construct an
EvaluationResults wrapping the single result) so the converter receives the
correct type and payload shape, and update any variable names accordingly.
| response = requests.post( | ||
| f"{request.qualifire_url}/api/rogue/v1/report", | ||
| headers={"X-qualifire-key": request.qualifire_api_key}, | ||
| json=api_evaluation_result.model_dump(mode="json"), | ||
| timeout=300, | ||
| ) |
There was a problem hiding this comment.
🛠️ Refactor suggestion
Harden network/error handling and avoid json() on error bodies.
requests.post can raise; response.json() may fail for non-JSON errors. Add try/except and prefer response.text on failures.
- response = requests.post(
- f"{request.qualifire_url}/api/rogue/v1/report",
- headers={"X-qualifire-key": request.qualifire_api_key},
- json=api_evaluation_result.model_dump(mode="json"),
- timeout=300,
- )
-
- if not response.ok:
- logger.error(
- "Failed to report summary to Qualifire",
- extra={"response": response.json()},
- )
- raise Exception(f"Failed to report summary to Qualifire: {response.json()}")
+ try:
+ headers = {}
+ if request.qualifire_api_key:
+ headers["X-Qualifire-Key"] = request.qualifire_api_key
+ response = requests.post(
+ f"{request.qualifire_url}/api/rogue/v1/report",
+ headers=headers,
+ json=api_evaluation_result.model_dump(mode="json"),
+ timeout=300,
+ )
+ except requests.RequestException as e:
+ logger.exception("Network error reporting summary to Qualifire")
+ raise
+
+ if not response.ok:
+ body = None
+ try:
+ body = response.json()
+ except Exception:
+ body = response.text
+ logger.error(
+ "Failed to report summary to Qualifire",
+ extra={"status": response.status_code, "response": body},
+ )
+ raise Exception(f"Failed to report summary to Qualifire: {body}")Also applies to: 33-39
🤖 Prompt for AI Agents
In rogue/server/services/qualifire_service.py around lines 26-31 (and similarly
for lines 33-39), the code calls requests.post and then assumes a JSON body;
wrap the network call and response parsing in a try/except that catches
requests.RequestException and handles timeouts/connection errors, check
response.ok before calling response.json(), and if parsing fails fall back to
response.text; log or raise a clear error including status_code and
response.text when response.ok is False or JSON decoding fails, and ensure the
timeout is preserved and resources are cleaned up.
| data = SummaryGenerationRequest( | ||
| results=results, | ||
| model=model, | ||
| qualifire_api_key=qualifire_api_key, | ||
| job_id=job_id, | ||
| deep_test=deep_test, | ||
| judge_model=judge_model, | ||
| ) |
There was a problem hiding this comment.
🛠️ Refactor suggestion
Avoid sending None for non-optional job_id.
SummaryGenerationRequest.job_id is str; passing None overrides the default and may fail validation.
- data = SummaryGenerationRequest(
+ data = SummaryGenerationRequest(
results=results,
model=model,
- qualifire_api_key=qualifire_api_key,
- job_id=job_id,
+ qualifire_api_key=qualifire_api_key,
+ job_id=job_id or "",
deep_test=deep_test,
judge_model=judge_model,
)📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| data = SummaryGenerationRequest( | |
| results=results, | |
| model=model, | |
| qualifire_api_key=qualifire_api_key, | |
| job_id=job_id, | |
| deep_test=deep_test, | |
| judge_model=judge_model, | |
| ) | |
| data = SummaryGenerationRequest( | |
| results=results, | |
| model=model, | |
| qualifire_api_key=qualifire_api_key, | |
| job_id=job_id or "", | |
| deep_test=deep_test, | |
| judge_model=judge_model, | |
| ) |
🤖 Prompt for AI Agents
In sdks/python/rogue_sdk/client.py around lines 158 to 165, the code passes
job_id which may be None into SummaryGenerationRequest.job_id (a non-optional
str) causing validation errors; change the call to only supply job_id when it is
not None (e.g., build kwargs and conditionally add 'job_id' or default to a
valid string) so None is never passed into the SummaryGenerationRequest
constructor.
| class ApiEvaluationResult(BaseModel): | ||
| """New API format for evaluation results.""" | ||
|
|
||
| scenarios: List[ApiScenarioResult] | ||
|
|
||
|
|
There was a problem hiding this comment.
🛠️ Refactor suggestion
ApiEvaluationResult missing metadata fields
Add optional fields to align with server API surface.
class ApiEvaluationResult(BaseModel):
"""New API format for evaluation results."""
scenarios: List[ApiScenarioResult]
+ summary: Optional[str] = None
+ keyFindings: Optional[str] = None
+ recommendation: Optional[str] = None
+ deepTest: bool = False
+ startTime: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+ judgeModel: Optional[str] = NoneOptional: extend convert_to_api_format(...) to accept structured_summary/deep_test/start_time/judge_model and populate these.
📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| class ApiEvaluationResult(BaseModel): | |
| """New API format for evaluation results.""" | |
| scenarios: List[ApiScenarioResult] | |
| class ApiEvaluationResult(BaseModel): | |
| """New API format for evaluation results.""" | |
| scenarios: List[ApiScenarioResult] | |
| summary: Optional[str] = None | |
| keyFindings: Optional[str] = None | |
| recommendation: Optional[str] = None | |
| deepTest: bool = False | |
| startTime: datetime = Field( | |
| default_factory=lambda: datetime.now(timezone.utc) | |
| ) | |
| judgeModel: Optional[str] = None |
🤖 Prompt for AI Agents
In sdks/python/rogue_sdk/types.py around lines 239 to 244, ApiEvaluationResult
currently only declares scenarios but is missing optional metadata fields
expected by the server API; add optional attributes for structured_summary
(e.g., Optional[Dict[str, Any]] or Optional[StructuredSummaryType]), deep_test
(Optional[bool]), start_time (Optional[datetime] or Optional[str] ISO), and
judge_model (Optional[str]) to the Pydantic model, marking them Optional and
with sensible defaults (None); optionally update the convert_to_api_format(...)
helper to accept and pass through structured_summary, deep_test, start_time, and
judge_model parameters so ApiEvaluationResult instances are populated with these
fields when available.
There was a problem hiding this comment.
Actionable comments posted: 0
♻️ Duplicate comments (1)
packages/tui/internal/tui/app.go (1)
90-114: Panic risk: dereference before err check; also O(n²) string buildingCheck err before accessing structuredSummary; build output with strings.Builder and only add non-empty sections.
Apply:
- structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) - - overallSummary := structuredSummary.Summary.OverallSummary - keyFindings := structuredSummary.Summary.KeyFindings - parsedKeyFindings := "" - for _, finding := range keyFindings { - parsedKeyFindings += "- " + finding + "\n" - } - recommendations := structuredSummary.Summary.Recommendations - parsedRecommendations := "" - for _, recommendation := range recommendations { - parsedRecommendations += "- " + recommendation + "\n" - } - - detailedBreakdown := structuredSummary.Summary.DetailedBreakdown - parsedDetailedBreakdown := "" - for _, breakdown := range detailedBreakdown { - parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" - } - - summary := "## Overall Summary\n\n" + overallSummary + - "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + - "\n\n" + "## Recommendations\n\n" + parsedRecommendations + - "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) + if err != nil { + return SummaryGeneratedMsg{Summary: "", Err: err} + } + + sum := structuredSummary.Summary + var b strings.Builder + + if sum.OverallSummary != "" { + b.WriteString("## Overall Summary\n\n") + b.WriteString(sum.OverallSummary) + b.WriteString("\n\n") + } + if len(sum.KeyFindings) > 0 { + b.WriteString("## Key Findings\n\n") + for _, f := range sum.KeyFindings { + if f == "" { + continue + } + b.WriteString("- ") + b.WriteString(f) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.Recommendations) > 0 { + b.WriteString("## Recommendations\n\n") + for _, r := range sum.Recommendations { + if r == "" { + continue + } + b.WriteString("- ") + b.WriteString(r) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.DetailedBreakdown) > 0 { + b.WriteString("## Detailed 
Breakdown\n\n") + for _, br := range sum.DetailedBreakdown { + b.WriteString("- ") + b.WriteString(br.Scenario) + b.WriteString(" - ") + b.WriteString(br.Status) + b.WriteString(" - ") + b.WriteString(br.Outcome) + b.WriteByte('\n') + } + } + + summary := strings.TrimSpace(b.String())#!/bin/bash # Confirm all call sites match the updated SDK signature and there’s exactly one definition. rg -nP 'GenerateSummary\s*\(' -C2 rg -nP 'func\s*\(\s*\*?RogueSDK\s*\)\s*GenerateSummary\s*\(' -C3
🧹 Nitpick comments (1)
packages/tui/internal/tui/app.go (1)
316-333: Paste into dialogs: insert at cursor, not always append. For consistency with the LLM dialog, insert at InputCursor; also normalize CRLF.
Apply:
- // Clean the clipboard text (remove newlines and trim whitespace) - cleanText := strings.TrimSpace(strings.ReplaceAll(clipboardText, "\n", "")) + // Clean the clipboard text (strip CRLF/newlines and trim) + cleanText := strings.TrimSpace(strings.ReplaceAll(strings.ReplaceAll(clipboardText, "\r\n", ""), "\n", "")) @@ - m.dialog.Input += cleanText - m.dialog.InputCursor = len(m.dialog.Input) + before := m.dialog.Input[:m.dialog.InputCursor] + after := m.dialog.Input[m.dialog.InputCursor:] + m.dialog.Input = before + cleanText + after + m.dialog.InputCursor += len(cleanText)
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Lite
📒 Files selected for processing (2)
packages/tui/internal/components/llm_config_dialog.go(2 hunks)packages/tui/internal/tui/app.go(5 hunks)
🧰 Additional context used
🧬 Code graph analysis (1)
packages/tui/internal/tui/app.go (3)
packages/tui/internal/components/command_line.go (1)
CommandInput(21-30)packages/tui/internal/components/dialog.go (1)
Dialog(38-51)packages/tui/internal/components/viewport.go (1)
NewViewport(74-93)
🔇 Additional comments (5)
packages/tui/internal/components/llm_config_dialog.go (2)
552-556: Good: now using the exported clipboard helper. This aligns with reuse across the components package.
572-600: Ignore the duplicate-removal suggestion. No duplicate `GetClipboardContent` definition exists—only the one in `packages/tui/internal/components/llm_config_dialog.go` was found. Likely an incorrect or invalid review comment.
packages/tui/internal/tui/app.go (3)
160-175: Model fields extension looks fine. detailedBreakdown typed as table.Row aligns with the new import.
269-269: Viewport height tweak acknowledged. Reduces the report panel height to 15; no concerns.
11-13: No action required: dependency already declared in packages/tui/go.mod
There was a problem hiding this comment.
Actionable comments posted: 2
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (1)
rogue/server/api/llm.py (1)
91-96: Offload sync LLM summary generation to a threadpool. Prevent event-loop blocking by running the synchronous LLM call in a threadpool.
- summary = LLMService.generate_summary_from_results( - model=request.model, - results=request.results, - llm_provider_api_key=request.api_key, - ) + summary = await run_in_threadpool( + LLMService.generate_summary_from_results, + model=request.model, + results=request.results, + llm_provider_api_key=request.api_key, + )
♻️ Duplicate comments (3)
rogue/server/api/llm.py (3)
99-102: Do not log secrets (qualifire_api_key). This logs a raw API key. Log only presence.
- logger.info( - "Qualifire API key", - extra={"qualifire_api_key": request.qualifire_api_key}, - ) + logger.info( + "Qualifire API key provided", + extra={"present": bool(request.qualifire_api_key)}, + )
132-147: Run Qualifire reporting in threadpool and fix type mismatch to EvaluationResults
- Offload the blocking requests.post.
- QualifireService.report_summary expects EvaluationResult but callers pass EvaluationResults; align kwarg and type to evaluation_results. This comment complements prior review feedback.
Proposed change here (assumes updating QualifireService signature to accept
evaluation_results: EvaluationResults):- QualifireService.report_summary( - ReportSummaryRequest( - job_id=request.job_id, - structured_summary=summary, - deep_test=job.deep_test if job else request.deep_test, - start_time=( - job.created_at - if job is not None - else datetime.now(timezone.utc) - ), - judge_model=job.judge_model if job else request.judge_model, - qualifire_url=request.qualifire_url, - qualifire_api_key=request.qualifire_api_key, - ), - evaluation_result=request.results, - ) + await run_in_threadpool( + QualifireService.report_summary, + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=summary, + deep_test=job.deep_test if job else request.deep_test, + start_time=( + job.created_at + if job is not None + else datetime.now(timezone.utc) + ), + judge_model=job.judge_model if job else request.judge_model, + qualifire_url=request.qualifire_url, + qualifire_api_key=request.qualifire_api_key, + ), + evaluation_results=request.results, + )Verification script to locate and confirm the service signature and all call sites:
#!/bin/bash # Find QualifireService.report_summary signature and usages rg -nP 'class\s+QualifireService\b|def\s+report_summary\s*\(' -C3 rogue rg -nP 'QualifireService\.report_summary\s*\(' -n -C2 rogueI can prep the matching patch for rogue/server/services/qualifire_service.py if you want.
187-190: Same two issues here: blocking call and the EvaluationResults type kwarg. Mirror the fixes from the other call site.
- QualifireService.report_summary( - request, - evaluation_result=EvaluationResults(results=results), - ) + await run_in_threadpool( + QualifireService.report_summary, + request, + evaluation_results=EvaluationResults(results=results), + )
🧹 Nitpick comments (3)
rogue/server/api/llm.py (3)
127-131: Avoid dumping large/PII payloads into logsLogging full summary and results can bloat logs and risk sensitive data exposure. Log sizes/ids instead.
- logger.info( - "Summary", - extra={"summary": summary, "results": request.results}, - ) + logger.info( + "Summary ready", + extra={ + "results_count": len(request.results.results), + }, + )
162-166: Add a return type annotation. Annotate the handler to satisfy mypy and repo guidelines.
-async def report_summary_handler( +async def report_summary_handler( request: ReportSummaryRequest, evaluation_service: EvaluationService = Depends(get_evaluation_service), -): +) -> ReportSummaryResponse:
195-200: Preserve HTTPException details; don’t wrap. Re-raise HTTPException to keep its detail/status unmodified; wrap only unexpected exceptions.
- except Exception as e: - logger.exception("Failed to report summary") - raise HTTPException( - status_code=e.status_code if hasattr(e, "status_code") else 500, - detail=f"Failed to report summary: {str(e)}", - ) + except HTTPException: + raise + except Exception as e: + logger.exception("Failed to report summary") + raise HTTPException(status_code=500, detail=f"Failed to report summary: {e}")
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (2)
rogue/evaluator_agent/evaluator_agent.py(3 hunks)rogue/server/api/llm.py(4 hunks)
🚧 Files skipped from review as they are similar to previous changes (1)
- rogue/evaluator_agent/evaluator_agent.py
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/api/llm.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/api/llm.py
🧬 Code graph analysis (1)
rogue/server/api/llm.py (7)
sdks/python/rogue_sdk/types.py (4)
EvaluationResults(192-208)SummaryGenerationRequest(431-441)ReportSummaryResponse(513-516)ReportSummaryRequest(501-510)rogue/server/api/evaluation.py (1)
get_evaluation_service(23-24)rogue/server/services/evaluation_service.py (2)
EvaluationService(20-234)get_job(31-33)rogue/server/models/api_format.py (1)
ServerSummaryGenerationResponse(52-56)rogue/common/logging/config.py (1)
get_logger(103-115)rogue/server/services/llm_service.py (1)
LLMService(166-283)rogue/server/services/qualifire_service.py (2)
QualifireService(8-40)report_summary(10-40)
⏰ Context from checks skipped due to timeout of 90000ms. You can increase the timeout in your CodeRabbit configuration to a maximum of 15 minutes (900000ms). (1)
- GitHub Check: codestyle
🔇 Additional comments (2)
rogue/server/api/llm.py (2)
69-76: Endpoint shape and DI look good. Switching to ServerSummaryGenerationResponse and injecting EvaluationService via Depends is clean.
149-152: Response construction looks good. Returning the structured summary and success message via the server model is consistent.
| if not results or len(results) == 0: | ||
| raise HTTPException( | ||
| status_code=404, | ||
| detail="Evaluation results not found or empty", | ||
| ) |
There was a problem hiding this comment.
Bug: `len()` called on EvaluationResults (no `__len__`)
This will raise TypeError. Check the underlying list instead.
- if not results or len(results) == 0:
+ if not results or not results.results:📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| if not results or len(results) == 0: | |
| raise HTTPException( | |
| status_code=404, | |
| detail="Evaluation results not found or empty", | |
| ) | |
| if not results or not results.results: | |
| raise HTTPException( | |
| status_code=404, | |
| detail="Evaluation results not found or empty", | |
| ) |
🤖 Prompt for AI Agents
In rogue/server/api/llm.py around lines 181-185, the code calls len(results) on
an EvaluationResults object which lacks __len__; replace that check by
inspecting the underlying list attribute instead. Change the condition to check
the actual list field on the EvaluationResults instance (for example
results.evaluations or results.items — use the real attribute name in this
class) and call len() on that list or simply test its truthiness (e.g., if not
results or not results.<list_attr>: raise HTTPException(...)). Ensure you do not
call len() directly on the EvaluationResults object.
There was a problem hiding this comment.
Actionable comments posted: 7
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (2)
sdks/python/rogue_sdk/types.py (1)
484-499: Bug: invalid HttpUrl construction in validator
HttpUrl(v)is not a constructor in Pydantic v2 and will raise. Either drop this validator or useTypeAdapter.-from pydantic import ( +from pydantic import ( BaseModel, ConfigDict, Field, HttpUrl, field_validator, model_validator, ) +from pydantic import TypeAdapter @@ - @field_validator("base_url", mode="after") - def validate_base_url(cls, v: str | HttpUrl) -> HttpUrl: - if isinstance(v, str): - return HttpUrl(v) - return v + @field_validator("base_url", mode="before") + def validate_base_url(cls, v): + # Accept str or HttpUrl; coerce strings into HttpUrl + return TypeAdapter(HttpUrl).validate_python(v)packages/tui/internal/tui/evaluation.go (1)
29-32: Align ScenarioType with server API (replace "custom").Python/TS SDKs expose "policy" and "prompt_injection" (no "custom"). Using "custom" will likely 400 the request when validated server-side.
Apply:
-const ( - ScenarioTypePolicy ScenarioType = "policy" - ScenarioTypeCustom ScenarioType = "custom" -) +const ( + ScenarioTypePolicy ScenarioType = "policy" + ScenarioTypePromptInjection ScenarioType = "prompt_injection" +)
♻️ Duplicate comments (3)
packages/tui/internal/tui/app.go (2)
101-106: Good: error-checked before dereferenceEarly-return on
errprevents panics when accessingstructuredSummary. Matches earlier guidance.
110-131: Avoid O(n²) string concatenations; use strings.Builder and skip empty itemsThe loops build large strings via
+=. Preferstrings.Builderand write sections conditionally.- overallSummary := structuredSummary.Summary.OverallSummary - keyFindings := structuredSummary.Summary.KeyFindings - parsedKeyFindings := "" - for _, finding := range keyFindings { - parsedKeyFindings += "- " + finding + "\n" - } - recommendations := structuredSummary.Summary.Recommendations - parsedRecommendations := "" - for _, recommendation := range recommendations { - parsedRecommendations += "- " + recommendation + "\n" - } - detailedBreakdown := structuredSummary.Summary.DetailedBreakdown - parsedDetailedBreakdown := "" - for _, breakdown := range detailedBreakdown { - parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" - } - - summary := "## Overall Summary\n\n" + overallSummary + - "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + - "\n\n" + "## Recommendations\n\n" + parsedRecommendations + - "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown + sum := structuredSummary.Summary + var b strings.Builder + if sum.OverallSummary != "" { + b.WriteString("## Overall Summary\n\n") + b.WriteString(sum.OverallSummary) + b.WriteString("\n\n") + } + if len(sum.KeyFindings) > 0 { + b.WriteString("## Key Findings\n\n") + for _, f := range sum.KeyFindings { + if f == "" { continue } + b.WriteString("- ") + b.WriteString(f) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.Recommendations) > 0 { + b.WriteString("## Recommendations\n\n") + for _, r := range sum.Recommendations { + if r == "" { continue } + b.WriteString("- ") + b.WriteString(r) + b.WriteByte('\n') + } + b.WriteByte('\n') + } + if len(sum.DetailedBreakdown) > 0 { + b.WriteString("## Detailed Breakdown\n\n") + for _, br := range sum.DetailedBreakdown { + b.WriteString("- ") + b.WriteString(br.Scenario) + b.WriteString(" - ") + b.WriteString(br.Status) + b.WriteString(" - ") + b.WriteString(br.Outcome) + b.WriteByte('\n') + } + } + summary := 
strings.TrimSpace(b.String())rogue/server/api/llm.py (1)
99-103: Do not log secrets (qualifire_api_key). This prints the raw API key to logs.
- logger.info( - "Qualifire API key", - extra={"qualifire_api_key": request.qualifire_api_key}, - ) + logger.info("Qualifire API key provided", extra={"present": bool(request.qualifire_api_key)})
🧹 Nitpick comments (10)
packages/tui/internal/tui/app.go (1)
334-351: Paste handling: avoid re-reading the clipboard; sanitize minimally. On
tea.PasteMsg, you re-read the OS clipboard (can be slow/error-prone) and strip all newlines. Prefer handling the paste event’s content (if available in your version) or at least replace\nwith spaces to preserve word boundaries.- cleanText := strings.TrimSpace(strings.ReplaceAll(clipboardText, "\n", "")) + cleaned := strings.ReplaceAll(clipboardText, "\r\n", "\n") + cleaned = strings.TrimSpace(strings.ReplaceAll(cleaned, "\n", " "))sdks/python/rogue_sdk/types.py (2)
264-275: Surface timestamp parse failures. Silently falling back to now() hides data issues. Log a warning when parsing fails.
- if msg.timestamp: + if msg.timestamp: try: if isinstance(msg.timestamp, str): timestamp = datetime.fromisoformat( msg.timestamp.replace("Z", "+00:00"), ) else: timestamp = msg.timestamp except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) + logger.warning("Failed to parse timestamp: %r", msg.timestamp) + timestamp = datetime.now(timezone.utc)
444-451: Consider typing detailed_breakdown rows
List[dict]loses structure. Optional: model it to match server/TUI:{scenario,status,outcome}.-class StructuredSummary(BaseModel): +class StructuredBreakdownItem(BaseModel): + scenario: str + status: str + outcome: str + +class StructuredSummary(BaseModel): @@ - detailed_breakdown: List[dict] # Table rows for scenario breakdown + detailed_breakdown: List[StructuredBreakdownItem]rogue/server/api/llm.py (1)
127-131: Reduce log verbosity for large payloads. Logging the entire
summaryandresultsat INFO can bloat logs. Use DEBUG or drop payload fields.- logger.info( - "Summary", - extra={"summary": summary, "results": request.results}, - ) + logger.debug("Summary generated", extra={"results_count": len(request.results.results)})packages/tui/internal/tui/evaluation.go (6)
96-105: Name the breakdown row type for reuse and clarity. An inline anonymous struct hampers reuse/validation.
 type StructuredSummary struct {
 	OverallSummary  string   `json:"overall_summary"`
 	KeyFindings     []string `json:"key_findings"`
 	Recommendations []string `json:"recommendations"`
-	DetailedBreakdown []struct {
-		Scenario string `json:"scenario"`
-		Status   string `json:"status"`
-		Outcome  string `json:"outcome"`
-	} `json:"detailed_breakdown"`
+	DetailedBreakdown []DetailedBreakdownRow `json:"detailed_breakdown"`
 }

Add below (outside selected range):
type DetailedBreakdownRow struct {
	Scenario string `json:"scenario"`
	Status   string `json:"status"`
	Outcome  string `json:"outcome"`
}
431-437: Stricter URL validation. url.Parse accepts many invalid inputs. Check scheme+host or use ParseRequestURI.
- if _, err := url.Parse(serverURL); err != nil { + u1, err := url.ParseRequestURI(serverURL) + if err != nil || (u1.Scheme != "http" && u1.Scheme != "https") || u1.Host == "" { return nil, nil, fmt.Errorf("invalid server url: %w", err) } - if _, err := url.Parse(agentURL); err != nil { + u2, err := url.ParseRequestURI(agentURL) + if err != nil || (u2.Scheme != "http" && u2.Scheme != "https") || u2.Host == "" { return nil, nil, fmt.Errorf("invalid agent url: %w", err) }
453-461: Pre-allocate scenarios slice. Avoids reallocations on large lists.
-// Convert scenarios -for _, s := range scenarios { +// Convert scenarios +request.Scenarios = make([]EvalScenario, 0, len(scenarios)) +for _, s := range scenarios { request.Scenarios = append(request.Scenarios, EvalScenario{ Scenario: s.Scenario, ScenarioType: s.ScenarioType, ExpectedOutcome: s.ExpectedOutcome, }) }
520-523: Be lenient on success status codes. APIs often return 200/201/202. Treat 2xx as success.
-if resp.StatusCode != http.StatusOK { +if resp.StatusCode < 200 || resp.StatusCode >= 300 { body, _ := io.ReadAll(resp.Body) return nil, fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) }
534-574: ReportSummary: allow 2xx and consider longer timeout. The call may wait on downstream BFF; 30s can be tight.
- resp, err := sdk.httpClient.Do(req) + client := &http.Client{Timeout: 2 * time.Minute} + resp, err := client.Do(req) ... -if resp.StatusCode != http.StatusOK { +if resp.StatusCode < 200 || resp.StatusCode >= 300 { body, _ := io.ReadAll(resp.Body) return fmt.Errorf("report summary failed: %d %s", resp.StatusCode, string(body)) }
465-474: Comment no longer matches behavior. Function returns structured JSON, not "markdown summary".
-// GenerateSummary generates a markdown summary from evaluation results +// GenerateSummary generates a structured summary (LLM-backed) from evaluation results.
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (5)
packages/tui/internal/tui/app.go(7 hunks)packages/tui/internal/tui/eval_ui.go(2 hunks)packages/tui/internal/tui/evaluation.go(6 hunks)rogue/server/api/llm.py(4 hunks)sdks/python/rogue_sdk/types.py(8 hunks)
🚧 Files skipped from review as they are similar to previous changes (1)
- packages/tui/internal/tui/eval_ui.go
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/api/llm.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/api/llm.py
🧬 Code graph analysis (4)
rogue/server/api/llm.py (6)
sdks/python/rogue_sdk/types.py (4)
EvaluationResults(192-208)SummaryGenerationRequest(431-441)ReportSummaryResponse(513-516)ReportSummaryRequest(501-510)rogue/server/api/evaluation.py (1)
get_evaluation_service(23-24)rogue/server/services/evaluation_service.py (2)
EvaluationService(20-234)get_job(31-33)rogue/server/models/api_format.py (1)
ServerSummaryGenerationResponse(52-56)rogue/server/services/qualifire_service.py (2)
QualifireService(8-40)report_summary(10-40)sdks/python/rogue_sdk/client.py (2)
generate_summary(147-194)request(79-83)
packages/tui/internal/tui/evaluation.go (2)
sdks/python/rogue_sdk/types.py (3)
Scenario(94-133)ScenarioType(45-49)StructuredSummary(444-450)packages/sdk/src/types.ts (1)
Scenario(41-47)
sdks/python/rogue_sdk/types.py (3)
rogue/server/models/api_format.py (4)
ApiChatMessage(14-19)ApiConversationEvaluation(22-27)ApiScenarioResult(30-37)ApiEvaluationResult(40-49)rogue/server/services/api_format_service.py (1)
convert_to_api_format(20-113)packages/tui/internal/tui/evaluation.go (1)
StructuredSummary(96-105)
packages/tui/internal/tui/app.go (6)
packages/tui/internal/tui/evaluation.go (2)
StructuredSummary(96-105)NewRogueSDK(112-119)packages/tui/internal/components/command_line.go (1)
CommandInput(21-30)packages/tui/internal/components/dialog.go (3)
Dialog(38-51)NewInfoDialog(65-78)PrimaryButton(32-32)packages/tui/internal/components/llm_config_dialog.go (2)
LLMConfigDialog(132-152)GetClipboardContent(573-600)packages/tui/internal/components/viewport.go (1)
NewViewport(74-93)packages/tui/internal/components/dialog_helpers.go (1)
ShowErrorDialog(6-19)
🔇 Additional comments (3)
packages/tui/internal/tui/app.go (1)
178-193: New detailedBreakdown field appears unused
Model.detailedBreakdown []table.Row isn't read or rendered here. Either wire it into views or remove to avoid dead state.

packages/tui/internal/tui/evaluation.go (2)
441-448: The directory paths and struct definitions need verification before proceeding. Ran scripts to list the repo root and locate EvaluationRequest and its JSON tags—once you share the output, we can see if the server expects judge_llm or judge_model.
491-496: Guard optional fields and confirm payload JSON schema
Only set qualifire_api_key when qualifireAPIKey is non-nil/non-empty; verify the server's SummaryGenerationRequest struct JSON tags (especially the results field nesting) to ensure you're sending the expected shape.
| structuredSummary, err := sdk.GenerateSummary( | ||
| ctx, | ||
| m.evalState.JobID, | ||
| judgeModel, | ||
| apiKey, | ||
| &m.config.QualifireAPIKey, | ||
| m.evalState.DeepTest, | ||
| judgeModel, | ||
| m.config.ServerURL, | ||
| ) |
There was a problem hiding this comment.
💡 Verification agent
🧩 Analysis chain
Possible incorrect SDK call: duplicate judgeModel argument; verify signature
GenerateSummary receives judgeModel twice (Lines 93 and 97). Confirm the parameter order and types to avoid passing a wrong value silently.
Run to locate the Go SDK method signature and all callsites:
🏁 Script executed:
#!/bin/bash
rg -nP 'func\s+\(\s*\*\s*RogueSDK\s*\)\s*GenerateSummary\(' -C3
rg -nP '\bGenerateSummary\(' -n -C2Length of output: 662
Replace duplicate judgeModel argument in GenerateSummary call
GenerateSummary’s signature expects two distinct model parameters; the 7th argument here reuses judgeModel but should be the intended model (e.g. summaryModel or m.config.AnswerModel). [packages/tui/internal/tui/app.go:90]
🤖 Prompt for AI Agents
In packages/tui/internal/tui/app.go around lines 90 to 99, the call to
sdk.GenerateSummary mistakenly passes judgeModel twice (as the 4th and 7th
arguments); replace the 7th argument with the intended summary/answer model (for
example summaryModel or m.config.AnswerModel) so the function receives the two
distinct model parameters expected by its signature, and ensure you import or
reference the correct model variable that exists in the surrounding scope.
| case "save_qualifire_and_report": | ||
| // Handle Qualifire API key save and report persistence | ||
| if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { | ||
| // Save the API key to config (allow empty to clear the key) | ||
| m.config.QualifireAPIKey = msg.Input | ||
| // Only enable integration if there's an API key | ||
| if msg.Input != "" { | ||
| m.config.QualifireEnabled = true | ||
| if m.configState != nil { | ||
| m.configState.QualifireEnabled = true | ||
| m.configState.HasChanges = true | ||
| } | ||
| } | ||
|
|
||
| // immediately report the summary | ||
| if m.evalState != nil && m.evalState.Completed { | ||
| sdk := NewRogueSDK(m.config.ServerURL) | ||
| err := sdk.ReportSummary( | ||
| context.Background(), | ||
| m.evalState.JobID, | ||
| m.evalState.StructuredSummary, | ||
| m.evalState.DeepTest, | ||
| m.evalState.JudgeModel, | ||
| m.config.QualifireAPIKey, | ||
| ) | ||
| if err != nil { | ||
| // Show error dialog | ||
| errorDialog := components.ShowErrorDialog( | ||
| "Report Summary Error", | ||
| fmt.Sprintf("Failed to report summary: %v", err), | ||
| ) | ||
| m.dialog = &errorDialog | ||
| } | ||
|
|
||
| err = m.saveConfig() | ||
| if err != nil { | ||
| // Show error dialog | ||
| errorDialog := components.ShowErrorDialog( | ||
| "Configuration Error", | ||
| fmt.Sprintf("Failed to save Qualifire configuration: %v", err), | ||
| ) | ||
| m.dialog = &errorDialog | ||
| return m, nil | ||
| } else { | ||
| // Show appropriate success dialog | ||
| var message string | ||
| if msg.Input != "" { | ||
| message = "Qualifire API key has been successfully saved and integration is now enabled. Your evaluation report will now be automatically persisted." | ||
| } else { | ||
| message = "Qualifire API key has been cleared and integration is now disabled." | ||
| } | ||
| successDialog := components.NewInfoDialog( | ||
| "Qualifire Configured", | ||
| message, | ||
| ) | ||
| m.dialog = &successDialog | ||
| return m, nil | ||
| } | ||
| } | ||
| } |
There was a problem hiding this comment.
🛠️ Refactor suggestion
Do not report when API key is empty; also avoid blocking UI with network I/O
This path attempts to report even when msg.Input == "", and performs a synchronous HTTP call inside Update, freezing the TUI during the request.
- Guard against empty key and skip reporting.
- Offload reporting into a
tea.Cmd (follow the pattern used by summaryGenerationCmd) and show a spinner.
Minimal immediate guard:
- // immediately report the summary
- if m.evalState != nil && m.evalState.Completed {
+ // immediately report the summary if we have a key
+ if m.evalState != nil && m.evalState.Completed && m.config.QualifireAPIKey != "" {
sdk := NewRogueSDK(m.config.ServerURL)
err := sdk.ReportSummary(Follow-up: happy to sketch a reportSummaryCmd(jobID, summary, deepTest, judgeModel, apiKey) if you want it non-blocking.
📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| case "save_qualifire_and_report": | |
| // Handle Qualifire API key save and report persistence | |
| if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { | |
| // Save the API key to config (allow empty to clear the key) | |
| m.config.QualifireAPIKey = msg.Input | |
| // Only enable integration if there's an API key | |
| if msg.Input != "" { | |
| m.config.QualifireEnabled = true | |
| if m.configState != nil { | |
| m.configState.QualifireEnabled = true | |
| m.configState.HasChanges = true | |
| } | |
| } | |
| // immediately report the summary | |
| if m.evalState != nil && m.evalState.Completed { | |
| sdk := NewRogueSDK(m.config.ServerURL) | |
| err := sdk.ReportSummary( | |
| context.Background(), | |
| m.evalState.JobID, | |
| m.evalState.StructuredSummary, | |
| m.evalState.DeepTest, | |
| m.evalState.JudgeModel, | |
| m.config.QualifireAPIKey, | |
| ) | |
| if err != nil { | |
| // Show error dialog | |
| errorDialog := components.ShowErrorDialog( | |
| "Report Summary Error", | |
| fmt.Sprintf("Failed to report summary: %v", err), | |
| ) | |
| m.dialog = &errorDialog | |
| } | |
| err = m.saveConfig() | |
| if err != nil { | |
| // Show error dialog | |
| errorDialog := components.ShowErrorDialog( | |
| "Configuration Error", | |
| fmt.Sprintf("Failed to save Qualifire configuration: %v", err), | |
| ) | |
| m.dialog = &errorDialog | |
| return m, nil | |
| } else { | |
| // Show appropriate success dialog | |
| var message string | |
| if msg.Input != "" { | |
| message = "Qualifire API key has been successfully saved and integration is now enabled. Your evaluation report will now be automatically persisted." | |
| } else { | |
| message = "Qualifire API key has been cleared and integration is now disabled." | |
| } | |
| successDialog := components.NewInfoDialog( | |
| "Qualifire Configured", | |
| message, | |
| ) | |
| m.dialog = &successDialog | |
| return m, nil | |
| } | |
| } | |
| } | |
| // immediately report the summary if we have a key | |
| if m.evalState != nil && m.evalState.Completed && m.config.QualifireAPIKey != "" { | |
| sdk := NewRogueSDK(m.config.ServerURL) | |
| err := sdk.ReportSummary( | |
| context.Background(), | |
| m.evalState.JobID, | |
| m.evalState.StructuredSummary, | |
| m.evalState.DeepTest, | |
| m.evalState.JudgeModel, | |
| m.config.QualifireAPIKey, | |
| ) |
🤖 Prompt for AI Agents
In packages/tui/internal/tui/app.go around lines 552 to 611, the code currently
attempts to report the summary synchronously even when msg.Input is empty and
performs a blocking HTTP call inside Update; change it so that if msg.Input ==
"" you skip any reporting entirely, and if non-empty, do NOT call
sdk.ReportSummary inline—instead spawn a non-blocking tea.Cmd (follow the
existing summaryGenerationCmd pattern) that runs ReportSummary in a goroutine,
dispatches progress/response messages to update (so you can show a spinner and
success/error dialogs), and only call m.saveConfig() either before enqueuing the
report cmd or handle save errors through the cmd flow; ensure Update returns
immediately with the cmd rather than performing network I/O synchronously.
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
….com/qualifire-dev/rogue-private into feature/FIRE-751-rogue-report-to-bff
There was a problem hiding this comment.
Actionable comments posted: 2
♻️ Duplicate comments (1)
rogue/run_cli.py (1)
210-214: Correct: unwrap SecretStr before passing to SDK. Converting SecretStr to a raw str before calling sdk.generate_summary addresses the earlier serialization issue.
🧹 Nitpick comments (1)
rogue/run_cli.py (1)
219-221: Consider passing job_id to enable Qualifire reporting. Server-side reporting to Qualifire only happens when both qualifire_api_key and job_id are provided. You’re passing qualifire_api_key but not job_id, so reporting will be skipped. If intended, ignore; otherwise, wire job_id through.
Add job_id to the SDK call:
summary, _ = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, qualifire_api_key=qualifire_api_key, deep_test=deep_test_mode, judge_model=judge_model, + job_id=job_id, )Apply supporting changes outside this hunk:
- Extend create_report signature:
- judge_model: str | None = None, + judge_model: str | None = None, + job_id: str | None = None,
- Return and propagate job_id from scenario run:
-async def run_scenarios(... ) -> EvaluationResults | None: +async def run_scenarios(... ) -> tuple[EvaluationResults, str] | None: -async def _run_scenarios_with_sdk(... ) -> EvaluationResults | None: +async def _run_scenarios_with_sdk(... ) -> tuple[EvaluationResults, str] | None: - return results + return results, final_job.job_id
- Unpack and pass job_id in run_cli:
- results = await run_scenarios( + run_out = await run_scenarios( ... - if not results: + if not run_out: raise ValueError(...) + results, job_id = run_out report_summary = await create_report( rogue_server_url=args.rogue_server_url, judge_llm=cli_input.judge_llm, results=results, output_report_file=cli_input.output_report_file, judge_llm_api_key_secret=cli_input.judge_llm_api_key, deep_test_mode=cli_input.deep_test_mode, judge_model=cli_input.judge_llm, + job_id=job_id, )
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (1)
rogue/run_cli.py(3 hunks)
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/run_cli.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/run_cli.py
🧬 Code graph analysis (1)
rogue/run_cli.py (3)
rogue/server/api/llm.py (1)
generate_summary(73-159)sdks/python/rogue_sdk/client.py (1)
generate_summary(147-194)sdks/python/rogue_sdk/sdk.py (1)
generate_summary(287-342)
🪛 Flake8 (7.2.0)
rogue/run_cli.py
[error] 209-209: IndentationError: expected an indented block after 'try' statement on line 208
(E999)
🪛 GitHub Actions: pr
rogue/run_cli.py
[error] 208-209: mypy: syntax error - expected an indented block after 'try' statement in rogue/run_cli.py (line 208). Command: 'uv run mypy --config-file .mypy.ini'.
🪛 GitHub Actions: Test
rogue/run_cli.py
[error] 208-208: IndentationError: expected an indented block after 'try' statement on line 208.
🪛 GitHub Actions: Rogue
rogue/run_cli.py
[error] 208-209: IndentationError: expected an indented block after 'try' statement on line 208 in rogue/run_cli.py (during startup of server; command: uv run -m rogue server --host 0.0.0.0 --port 8000 &).
🔇 Additional comments (1)
rogue/run_cli.py (1)
191-193: Signature extension looks good. Adding qualifire_api_key_secret, deep_test_mode, and judge_model to create_report is appropriate and correctly typed with safe defaults.
| deep_test_mode=cli_input.deep_test_mode, | ||
| judge_model=cli_input.judge_llm, | ||
| ) |
There was a problem hiding this comment.
💡 Verification agent
🧩 Analysis chain
Wire Qualifire API key from CLI/config (if available)
create_report now accepts qualifire_api_key_secret but the call site doesn’t pass it. If CLIInput exposes it, forward it.
report_summary = await create_report(
rogue_server_url=args.rogue_server_url,
judge_llm=cli_input.judge_llm,
results=results,
output_report_file=cli_input.output_report_file,
judge_llm_api_key_secret=cli_input.judge_llm_api_key,
+ qualifire_api_key_secret=cli_input.qualifire_api_key,
deep_test_mode=cli_input.deep_test_mode,
judge_model=cli_input.judge_llm,
)Run to confirm the field name on CLIInput/PartialCLIInput:
🏁 Script executed:
#!/bin/bash
set -euo pipefail
rg -n --type=py -C2 'class\s+CLIInput|class\s+PartialCLIInput'
rg -n --type=py -C2 'qualifire.*api.*key'
rg -n --type=py -C2 '\bjob_id\b'Length of output: 32349
🏁 Script executed:
#!/bin/bash
set -euo pipefail
echo "=== rogue/models/cli_input.py ==="
sed -n '1,240p' rogue/models/cli_input.py || true
echo
echo "=== rogue/run_cli.py (first 520 lines with numbers) ==="
nl -ba -v1 -w4 -s': ' rogue/run_cli.py | sed -n '1,520p' || true
echo
echo "=== create_report call sites ==="
rg -n --type py 'create_report\(' -S || true
rg -n --type py 'create_report\(' -S | while IFS=: read -r file lineno rest; do
echo "---- $file:$lineno ----"
sed -n "$((lineno-3)),$((lineno+3))p" "$file" || true
doneLength of output: 3650
Wire Qualifire API key from CLI/config and forward it to create_report
create_report accepts qualifire_api_key_secret but run_cli doesn't pass it and CLIInput/PartialCLIInput do not expose the key — add the field and forward it.
- Add qualifire_api_key: SecretStr | None = None to CLIInput and PartialCLIInput (rogue/models/cli_input.py — next to judge_llm_api_key).
- Forward it at the create_report call site in rogue/run_cli.py (call around line 361): add qualifire_api_key_secret=cli_input.qualifire_api_key.
Suggested change:
report_summary = await create_report(
rogue_server_url=args.rogue_server_url,
judge_llm=cli_input.judge_llm,
results=results,
output_report_file=cli_input.output_report_file,
judge_llm_api_key_secret=cli_input.judge_llm_api_key,
+ qualifire_api_key_secret=cli_input.qualifire_api_key,
deep_test_mode=cli_input.deep_test_mode,
judge_model=cli_input.judge_llm,
)- Update tests that construct CLIInput (rogue/tests/models/test_cli_input.py) if they need to supply the new field.
📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| deep_test_mode=cli_input.deep_test_mode, | |
| judge_model=cli_input.judge_llm, | |
| ) | |
| qualifire_api_key_secret=cli_input.qualifire_api_key, | |
| deep_test_mode=cli_input.deep_test_mode, | |
| judge_model=cli_input.judge_llm, | |
| ) |
🤖 Prompt for AI Agents
In rogue/run_cli.py around lines 367-369, the call to create_report does not
pass the qualifire API key; add qualifire_api_key: SecretStr | None = None to
CLIInput and PartialCLIInput in rogue/models/cli_input.py (place it next to
judge_llm_api_key), then forward it in the create_report call by adding
qualifire_api_key_secret=cli_input.qualifire_api_key; finally update any tests
that construct CLIInput (e.g., rogue/tests/models/test_cli_input.py) to include
the new field where necessary.
There was a problem hiding this comment.
Actionable comments posted: 2
♻️ Duplicate comments (1)
rogue/run_cli.py (1)
369-371: Wire Qualifire API key (SecretStr) and job_id into the create_report call. Currently, create_report isn’t given qualifire_api_key_secret or job_id, so reporting won’t trigger. This was flagged earlier and remains. Pass both.
report_summary = await create_report( rogue_server_url=args.rogue_server_url, judge_llm=cli_input.judge_llm, results=results, output_report_file=cli_input.output_report_file, judge_llm_api_key_secret=cli_input.judge_llm_api_key, + qualifire_api_key_secret=cli_input.qualifire_api_key, + job_id=job_id, deep_test_mode=cli_input.deep_test_mode, judge_model=cli_input.judge_llm, )Follow-up (outside this hunk): plumb job_id from the evaluation step.
- Return job_id from _run_scenarios_with_sdk and run_scenarios:
# signatures async def _run_scenarios_with_sdk(...) -> tuple[EvaluationResults | None, str | None]: async def run_scenarios(...) -> tuple[EvaluationResults | None, str | None]:
- Inside _run_scenarios_with_sdk, after wait_for_evaluation:
if final_job.results: results = EvaluationResults(results=final_job.results) evaluation_results_output_path.write_text( results.model_dump_json(indent=2, exclude_none=True), encoding="utf-8", ) return results, final_job.job_id else: logger.error("Scenario evaluation completed but no results found.") return None, final_job.job_id
- In run_cli:
results, job_id = await run_scenarios(...) if not results: raise ValueError(f"No scenarios were evaluated for {cli_input.evaluated_agent_url}") report_summary = await create_report( ..., qualifire_api_key_secret=cli_input.qualifire_api_key, job_id=job_id, ... )#!/usr/bin/env bash # Verify SDK and models expose needed fields set -euo pipefail rg -n --type=py -C2 'def generate_summary\(' sdks/python/rogue_sdk | sed -n '1,200p' rg -n --type=py -C2 '\bqualifire_api_key\b|\bjob_id\b' sdks/python/rogue_sdk rg -n --type=py -C3 'class\s+(CLIInput|PartialCLIInput)\b' rogue/models | sed -n '1,240p' rg -n --type=py -C2 '\bqualifire.*api.*key\b' rogue/models
🧹 Nitpick comments (1)
rogue/run_cli.py (1)
222-224: Re-raise with bare raise to preserve traceback

raise e loses the original traceback context; use raise.

-    except Exception as e:
-        logger.exception("Failed to generate summary")
-        raise e
+    except Exception:
+        logger.exception("Failed to generate summary")
+        raise
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (1)
rogue/run_cli.py(3 hunks)
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/run_cli.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/run_cli.py
🧬 Code graph analysis (1)
rogue/run_cli.py (3)
rogue/server/api/llm.py (1)
generate_summary(73-159)sdks/python/rogue_sdk/client.py (1)
generate_summary(147-194)sdks/python/rogue_sdk/sdk.py (1)
generate_summary(287-342)
🪛 GitHub Actions: Rogue
rogue/run_cli.py
[error] 178-178: Scenario evaluation completed but no results found.
[error] 358-358: No scenarios were evaluated for http://localhost:10001/ (ValueError) during rogue CLI run. Command: 'uv run -m rogue cli --workdir ./examples/tshirt_store_agent/.rogue --evaluated-agent-url http://localhost:10001 --judge-llm openai/gpt-4.1-mini'
🔇 Additional comments (1)
rogue/run_cli.py (1)
209-213: LGTM: SecretStr correctly unwrapped before passing to SDK. This addresses the earlier serialization concern and avoids leaking the secret via logs.
| qualifire_api_key_secret: SecretStr | None = None, | ||
| deep_test_mode: bool = False, | ||
| judge_model: str | None = None, | ||
| ) -> str: |
There was a problem hiding this comment.
Plumb job_id through create_report to enable Qualifire reporting
Without job_id, server-side reporting is skipped even if qualifire_api_key is provided. Add job_id to the function signature so it can be forwarded to the SDK call.
async def create_report(
rogue_server_url: str,
judge_llm: str,
results: EvaluationResults,
output_report_file: Path,
judge_llm_api_key_secret: SecretStr | None = None,
- qualifire_api_key_secret: SecretStr | None = None,
- deep_test_mode: bool = False,
- judge_model: str | None = None,
+ qualifire_api_key_secret: SecretStr | None = None,
+ job_id: str | None = None,
+ deep_test_mode: bool = False,
+ judge_model: str | None = None,
) -> str:📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| qualifire_api_key_secret: SecretStr | None = None, | |
| deep_test_mode: bool = False, | |
| judge_model: str | None = None, | |
| ) -> str: | |
| async def create_report( | |
| rogue_server_url: str, | |
| judge_llm: str, | |
| results: EvaluationResults, | |
| output_report_file: Path, | |
| judge_llm_api_key_secret: SecretStr | None = None, | |
| qualifire_api_key_secret: SecretStr | None = None, | |
| job_id: str | None = None, | |
| deep_test_mode: bool = False, | |
| judge_model: str | None = None, | |
| ) -> str: |
🤖 Prompt for AI Agents
In rogue/run_cli.py around lines 191 to 194, create_report currently lacks a
job_id parameter so Qualifire reporting is skipped; add a job_id: str | None =
None parameter to the function signature and propagate that job_id into the
downstream SDK call that submits the report (e.g., pass job_id=job_id when
calling the Qualifire client method). Also update any local calls to
create_report in this module to pass the new job_id where available, and adjust
type hints and tests/imports as needed.
| summary, _ = await sdk.generate_summary( | ||
| results=results, | ||
| model=judge_llm, | ||
| api_key=judge_llm_api_key, | ||
| qualifire_api_key=qualifire_api_key, | ||
| deep_test=deep_test_mode, | ||
| judge_model=judge_model, | ||
| ) |
There was a problem hiding this comment.
Forward job_id to SDK so BFF reporting actually happens
The SDK/client and server expect job_id for reporting; omitting it disables the new reporting path.
summary, _ = await sdk.generate_summary(
results=results,
model=judge_llm,
api_key=judge_llm_api_key,
qualifire_api_key=qualifire_api_key,
+ job_id=job_id,
deep_test=deep_test_mode,
judge_model=judge_model,
)📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| summary, _ = await sdk.generate_summary( | |
| results=results, | |
| model=judge_llm, | |
| api_key=judge_llm_api_key, | |
| qualifire_api_key=qualifire_api_key, | |
| deep_test=deep_test_mode, | |
| judge_model=judge_model, | |
| ) | |
| summary, _ = await sdk.generate_summary( | |
| results=results, | |
| model=judge_llm, | |
| api_key=judge_llm_api_key, | |
| qualifire_api_key=qualifire_api_key, | |
| job_id=job_id, | |
| deep_test=deep_test_mode, | |
| judge_model=judge_model, | |
| ) |
🤖 Prompt for AI Agents
In rogue/run_cli.py around lines 214 to 221, the call to sdk.generate_summary
omits the job_id parameter so the SDK/server reporting path is not triggered;
update the call to forward the current job_id (pass job_id=job_id) into
sdk.generate_summary alongside the existing args so the SDK can perform BFF
reporting. Ensure the job_id variable in scope is used and run tests to verify
reporting is enabled.
There was a problem hiding this comment.
Actionable comments posted: 0
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (3)
sdks/python/rogue_sdk/types.py (1)
494-499: Fix HttpUrl validator (current code constructs HttpUrl incorrectly). HttpUrl is a type, not a constructor. Use TypeAdapter to validate strings.
-from pydantic import ( +from pydantic import ( BaseModel, ConfigDict, Field, HttpUrl, + TypeAdapter, field_validator, model_validator, ) @@ @field_validator("base_url", mode="after") def validate_base_url(cls, v: str | HttpUrl) -> HttpUrl: - if isinstance(v, str): - return HttpUrl(v) - return v + if isinstance(v, str): + return TypeAdapter(HttpUrl).validate_python(v) + return vrogue/server/api/llm.py (2)
48-52: Offload scenario generation to threadpool. LLMService is sync; run it off the event loop.
- scenarios = LLMService.generate_scenarios( - model=request.model, - context=request.business_context, - llm_provider_api_key=request.api_key, - ) + scenarios = await run_in_threadpool( + LLMService.generate_scenarios, + model=request.model, + context=request.business_context, + llm_provider_api_key=request.api_key, + )
91-95: Offload summary generation to threadpool. Prevents blocking with provider SDK calls.
- summary = LLMService.generate_summary_from_results( - model=request.model, - results=request.results, - llm_provider_api_key=request.api_key, - ) + summary = await run_in_threadpool( + LLMService.generate_summary_from_results, + model=request.model, + results=request.results, + llm_provider_api_key=request.api_key, + )
♻️ Duplicate comments (10)
sdks/python/rogue_sdk/types.py (4)
263-275: Warn on timestamp parse failures (don’t silently fall back). Silent fallback hides data issues. Log and continue.
- except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) + except (ValueError, AttributeError) as e: + logger.warning( + "Failed to parse message timestamp; defaulting to now", + extra={"timestamp": msg.timestamp, "error": str(e)}, + ) + timestamp = datetime.now(timezone.utc)
230-237: Align ApiScenarioResult with server: add expectedOutcome. The field is present server-side; omitting it drops data.
class ApiScenarioResult(BaseModel): """Result of evaluating a single scenario in new API format.""" description: Optional[str] = None + expectedOutcome: Optional[str] = None totalConversations: Optional[int] = None flaggedConversations: Optional[int] = None conversations: List[ApiConversationEvaluation]
239-244: Add summary/metadata fields to ApiEvaluationResult. These are required by the server contract and downstream consumers.
class ApiEvaluationResult(BaseModel): """New API format for evaluation results.""" scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + judgeModel: Optional[str] = None
245-304: Extend the converter signature and map the structured summary plus metadata. Keep the SDK converter in sync with the server's convert_to_api_format behavior.
-def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: +def convert_to_api_format( + evaluation_results: EvaluationResults, + structured_summary: Optional["StructuredSummary"] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: @@ - api_scenarios = [] + if start_time is None: + start_time = datetime.now(timezone.utc) + api_scenarios: List[ApiScenarioResult] = [] @@ - api_scenarios.append( - ApiScenarioResult( - description=result.scenario.scenario, - totalConversations=len(api_conversations), - flaggedConversations=len( - [c for c in api_conversations if not c.passed], - ), - conversations=api_conversations, - ), - ) + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + expectedOutcome=result.scenario.expected_outcome, + totalConversations=len(api_conversations), + flaggedConversations=sum(1 for c in api_conversations if not c.passed), + conversations=api_conversations, + ) + ) @@ - return ApiEvaluationResult(scenarios=api_scenarios) + summary_text = None + key_findings_text = None + recommendation_text = None + if structured_summary: + summary_text = structured_summary.overall_summary + key_findings_text = "\n".join(f"• {f}" for f in structured_summary.key_findings) + recommendation_text = "\n".join(f"• {r}" for r in structured_summary.recommendations) + + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary_text, + keyFindings=key_findings_text, + recommendation=recommendation_text, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + )rogue/server/services/qualifire_service.py (1)
26-39: Harden the network call and error handling; avoid calling json() on error bodies. Wrap requests.post, gate the header on key presence, and prefer response.text on failures.
- response = requests.post( - f"{request.qualifire_url}/api/rogue/v1/report", - headers={"X-qualifire-key": request.qualifire_api_key}, - json=api_evaluation_result.model_dump(mode="json"), - timeout=300, - ) - - if not response.ok: - logger.error( - "Failed to report summary to Qualifire", - extra={"response": response.json()}, - ) - raise Exception(f"Failed to report summary to Qualifire: {response.json()}") + try: + headers = {} + if request.qualifire_api_key: + headers["X-Qualifire-Key"] = request.qualifire_api_key + response = requests.post( + f"{request.qualifire_url}/api/rogue/v1/report", + headers=headers, + json=api_evaluation_result.model_dump(mode="json"), + timeout=300, + ) + except requests.RequestException: + logger.exception("Network error reporting summary to Qualifire") + raise + + if not response.ok: + try: + body = response.json() + except Exception: + body = response.text + logger.error( + "Failed to report summary to Qualifire", + extra={"status": response.status_code, "response": body}, + ) + raise Exception(f"Failed to report summary to Qualifire: {body}")rogue/server/api/llm.py (5)
181-185: The results emptiness check is correct now. Using the underlying list avoids the earlier len(EvaluationResults) bug.
99-102: Do not log secrets. Log only the key's presence, not the raw API key.
- logger.info( - "Qualifire API key", - extra={"qualifire_api_key": request.qualifire_api_key}, - ) + logger.info("Qualifire API key provided", extra={"present": bool(request.qualifire_api_key)})
7-8: Import run_in_threadpool for sync I/O offloading, to avoid blocking the event loop.
-from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException +from fastapi.concurrency import run_in_threadpool
187-198: Offload Qualifire reporting in /report_summary to the threadpool, to avoid blocking the request handler.
- QualifireService.report_summary( + await run_in_threadpool( + QualifireService.report_summary, ReportSummaryRequest( @@ - ) + evaluation_results=EvaluationResults(results=results), + )
132-147: Offload Qualifire reporting to the threadpool. requests is sync; avoid blocking.
- QualifireService.report_summary( + await run_in_threadpool( + QualifireService.report_summary, ReportSummaryRequest( @@ - ) + evaluation_results=request.results, + )
🧹 Nitpick comments (6)
sdks/python/rogue_sdk/types.py (3)
255-255: Type-annotate the local list for mypy clarity — a minor readability and typing aid.
- api_scenarios = [] + api_scenarios: List[ApiScenarioResult] = []
444-451: Avoid duplicating StructuredSummary across packages. Prefer a single definition (server models vs. SDK). If duplication is required, add explicit JSON contract tests to ensure parity.
80-81: Mark API keys as secrets. Use SecretStr to avoid accidental logging/serialization of raw keys.
- qualifire_api_key: Optional[str] = None + from pydantic import SecretStr + qualifire_api_key: Optional[SecretStr] = NoneNote: Adjust get_auth_header callers to use .get_secret_value() when needed.
rogue/server/services/qualifire_service.py (1)
10-13: Add a return type annotation to clarify the return type for mypy/IDE tooling.
- def report_summary( + def report_summary( request: ReportSummaryRequest, evaluation_results: EvaluationResults, - ): + ) -> dict:rogue/server/api/llm.py (2)
127-131: Reduce verbose/PII-risk logging. Avoid dumping full results/summary objects in logs; log sizes instead.
- logger.info( - "Summary", - extra={"summary": summary, "results": request.results}, - ) + logger.info( + "Summary generated", + extra={ + "overall_summary_len": len(getattr(summary, "overall_summary", "") or ""), + "results_count": len(request.results.results), + }, + )
154-159: Preserve the HTTPException status; only wrap unknown errors. The current code returns 500 even for a deliberate HTTPException.
- except Exception as e: + except HTTPException: + raise + except Exception as e: logger.exception("Failed to generate summary") raise HTTPException( status_code=500, detail=f"Failed to generate summary: {str(e)}", )
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (3)
rogue/server/api/llm.py(4 hunks)rogue/server/services/qualifire_service.py(1 hunks)sdks/python/rogue_sdk/types.py(8 hunks)
🧰 Additional context used
📓 Path-based instructions (2)
{rogue,tests,examples}/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
{rogue,tests,examples}/**/*.py: Format Python code with Black
Organize imports according to isort conventions
Use type hints for all function signatures
Follow PEP 8 naming (snake_case for variables and functions, PascalCase for classes)
Use try/except blocks for code that may raise exceptions
Ensure code passes flake8 linting
Ensure code passes mypy type checks per .mypy.ini
Address Bandit-reported security issues in Python code
Files:
rogue/server/services/qualifire_service.pyrogue/server/api/llm.py
rogue/**/*.py
📄 CodeRabbit inference engine (AGENTS.md)
Place application/package code under the rogue/ directory
Files:
rogue/server/services/qualifire_service.pyrogue/server/api/llm.py
🧬 Code graph analysis (3)
rogue/server/services/qualifire_service.py (2)
rogue/server/services/api_format_service.py (1)
convert_with_structured_summary(116-141)sdks/python/rogue_sdk/types.py (2)
EvaluationResults(192-208)ReportSummaryRequest(501-510)
sdks/python/rogue_sdk/types.py (3)
rogue/server/models/api_format.py (4)
ApiChatMessage(14-19)ApiConversationEvaluation(22-27)ApiScenarioResult(30-37)ApiEvaluationResult(40-49)rogue/server/services/api_format_service.py (1)
convert_to_api_format(20-113)packages/tui/internal/tui/evaluation.go (1)
StructuredSummary(96-105)
rogue/server/api/llm.py (7)
sdks/python/rogue_sdk/types.py (4)
EvaluationResults(192-208)SummaryGenerationRequest(431-441)ReportSummaryResponse(513-516)ReportSummaryRequest(501-510)rogue/server/api/evaluation.py (1)
get_evaluation_service(23-24)rogue/server/services/evaluation_service.py (2)
EvaluationService(20-234)get_job(31-33)rogue/server/models/api_format.py (1)
ServerSummaryGenerationResponse(52-56)rogue/server/services/llm_service.py (1)
LLMService(166-283)rogue/server/services/qualifire_service.py (2)
QualifireService(8-40)report_summary(10-40)sdks/python/rogue_sdk/client.py (2)
generate_summary(147-194)request(79-83)
⏰ Context from checks skipped due to timeout of 90000ms. You can increase the timeout in your CodeRabbit configuration to a maximum of 15 minutes (900000ms). (1)
- GitHub Check: codestyle
🔇 Additional comments (3)
sdks/python/rogue_sdk/types.py (3)
389-391: LGTM: job metadata fields. deep_test and judge_model on EvaluationJob align with the reporting requirements.
501-511: LGTM: the ReportSummaryRequest shape matches its usage in routes/services.
513-517: LGTM: ReportSummaryResponse is minimal and sufficient for the API.
There was a problem hiding this comment.
Actionable comments posted: 1
Caution
Some comments are outside the diff and can’t be posted inline due to platform limitations.
⚠️ Outside diff range comments (1)
packages/tui/internal/tui/eval_ui.go (1)
119-120: Update the summary check to include StructuredSummary. The code only checks the plain Summary field when determining whether a summary already exists, but it should also consider StructuredSummary.
🧹 Nitpick comments (2)
packages/tui/internal/tui/eval_ui.go (2)
27-31: Consider consolidating the summary fields. Having both Summary (a string) and StructuredSummary fields could lead to confusion about which one to use. Consider deprecating the plain Summary field in favor of the structured version, or clearly document when each should be used.
47-49: Consider using the EvalScenario struct directly for unmarshaling. The anonymous struct duplicates the field definitions from
EvalScenario. You could simplify by using the actual type.var v struct { - Scenarios []struct { - Scenario string `json:"scenario"` - ScenarioType string `json:"scenario_type"` - ExpectedOutcome string `json:"expected_outcome"` - } `json:"scenarios"` + Scenarios []json.RawMessage `json:"scenarios"` } if json.Unmarshal(b, &v) == nil { out := make([]EvalScenario, 0, len(v.Scenarios)) - for _, s := range v.Scenarios { - if s.Scenario != "" { - out = append(out, EvalScenario{ - Scenario: s.Scenario, - ScenarioType: ScenarioType(s.ScenarioType), - ExpectedOutcome: s.ExpectedOutcome, - }) + for _, raw := range v.Scenarios { + var s EvalScenario + if err := json.Unmarshal(raw, &s); err == nil && s.Scenario != "" { + out = append(out, s) } } return out }Note: This approach would require adjusting the JSON tags in
EvalScenario to handle the scenario_type field as a string during unmarshaling, potentially with a custom UnmarshalJSON method.
📜 Review details
Configuration used: CodeRabbit UI
Review profile: CHILL
Plan: Pro
📒 Files selected for processing (1)
packages/tui/internal/tui/eval_ui.go(2 hunks)
🧰 Additional context used
🧬 Code graph analysis (1)
packages/tui/internal/tui/eval_ui.go (1)
packages/tui/internal/tui/evaluation.go (3)
EvalScenario(44-48)StructuredSummary(96-105)ScenarioType(27-27)
⏰ Context from checks skipped due to timeout of 90000ms. You can increase the timeout in your CodeRabbit configuration to a maximum of 15 minutes (900000ms). (1)
- GitHub Check: codestyle
🔇 Additional comments (2)
packages/tui/internal/tui/eval_ui.go (2)
17-17: LGTM! Clean transition to structured scenarios. The change from []string to []EvalScenario improves type safety and enables richer scenario metadata with expected outcomes.
77-77: Resolved — StartEvaluation signature and callers are up to date.
StartEvaluation (packages/tui/internal/tui/evaluation.go) accepts []EvalScenario and the TUI caller in packages/tui/internal/tui/eval_ui.go passes st.Scenarios; no action required.
| func loadScenariosFromWorkdir() []EvalScenario { | ||
| wd, _ := os.Getwd() | ||
| dir := wd | ||
| for { | ||
| p := filepath.Join(dir, ".rogue", "scenarios.json") | ||
| if b, err := os.ReadFile(p); err == nil { | ||
| var v struct { | ||
| Scenarios []struct { | ||
| Scenario string `json:"scenario"` | ||
| Scenario string `json:"scenario"` | ||
| ScenarioType string `json:"scenario_type"` | ||
| ExpectedOutcome string `json:"expected_outcome"` | ||
| } `json:"scenarios"` | ||
| } | ||
| if json.Unmarshal(b, &v) == nil { | ||
| out := make([]string, 0, len(v.Scenarios)) | ||
| out := make([]EvalScenario, 0, len(v.Scenarios)) | ||
| for _, s := range v.Scenarios { | ||
| if s.Scenario != "" { | ||
| out = append(out, s.Scenario) | ||
| out = append(out, EvalScenario{ | ||
| Scenario: s.Scenario, | ||
| ScenarioType: ScenarioType(s.ScenarioType), | ||
| ExpectedOutcome: s.ExpectedOutcome, | ||
| }) | ||
| } | ||
| } | ||
| return out |
There was a problem hiding this comment.
🛠️ Refactor suggestion
Add validation for ScenarioType conversion.
The code directly casts the s.ScenarioType string to ScenarioType without validation. If the JSON contains an invalid scenario type, this could lead to unexpected behavior.
Add validation when converting the scenario type:
for _, s := range v.Scenarios {
if s.Scenario != "" {
+ // Validate scenario type
+ scenarioType := ScenarioType(s.ScenarioType)
+ // Consider adding a validation method like scenarioType.IsValid()
+ // or at least document valid values
out = append(out, EvalScenario{
Scenario: s.Scenario,
- ScenarioType: ScenarioType(s.ScenarioType),
+ ScenarioType: scenarioType,
ExpectedOutcome: s.ExpectedOutcome,
})
}
}Consider implementing a validation method for ScenarioType in the evaluation.go file to ensure only valid scenario types are accepted.
📝 Committable suggestion
‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.
| func loadScenariosFromWorkdir() []EvalScenario { | |
| wd, _ := os.Getwd() | |
| dir := wd | |
| for { | |
| p := filepath.Join(dir, ".rogue", "scenarios.json") | |
| if b, err := os.ReadFile(p); err == nil { | |
| var v struct { | |
| Scenarios []struct { | |
| Scenario string `json:"scenario"` | |
| Scenario string `json:"scenario"` | |
| ScenarioType string `json:"scenario_type"` | |
| ExpectedOutcome string `json:"expected_outcome"` | |
| } `json:"scenarios"` | |
| } | |
| if json.Unmarshal(b, &v) == nil { | |
| out := make([]string, 0, len(v.Scenarios)) | |
| out := make([]EvalScenario, 0, len(v.Scenarios)) | |
| for _, s := range v.Scenarios { | |
| if s.Scenario != "" { | |
| out = append(out, s.Scenario) | |
| out = append(out, EvalScenario{ | |
| Scenario: s.Scenario, | |
| ScenarioType: ScenarioType(s.ScenarioType), | |
| ExpectedOutcome: s.ExpectedOutcome, | |
| }) | |
| } | |
| } | |
| return out | |
| func loadScenariosFromWorkdir() []EvalScenario { | |
| wd, _ := os.Getwd() | |
| dir := wd | |
| for { | |
| p := filepath.Join(dir, ".rogue", "scenarios.json") | |
| if b, err := os.ReadFile(p); err == nil { | |
| var v struct { | |
| Scenarios []struct { | |
| Scenario string `json:"scenario"` | |
| ScenarioType string `json:"scenario_type"` | |
| ExpectedOutcome string `json:"expected_outcome"` | |
| } `json:"scenarios"` | |
| } | |
| if json.Unmarshal(b, &v) == nil { | |
| out := make([]EvalScenario, 0, len(v.Scenarios)) | |
| for _, s := range v.Scenarios { | |
| if s.Scenario != "" { | |
| // Validate scenario type | |
| scenarioType := ScenarioType(s.ScenarioType) | |
| // Consider adding a validation method like scenarioType.IsValid() | |
| // or at least document valid values | |
| out = append(out, EvalScenario{ | |
| Scenario: s.Scenario, | |
| ScenarioType: scenarioType, | |
| ExpectedOutcome: s.ExpectedOutcome, | |
| }) | |
| } | |
| } | |
| return out |
🤖 Prompt for AI Agents
In packages/tui/internal/tui/eval_ui.go around lines 39 to 63, the code blindly
casts s.ScenarioType string to ScenarioType which can create invalid values if
JSON contains unexpected strings; update the code to validate the conversion by
adding/using a parser/validator (e.g., ParseScenarioType or
ScenarioType.IsValid) in evaluation.go that returns an error or bool and then
call it here, only append the EvalScenario when the parsed ScenarioType is valid
(otherwise skip the entry or handle/log the error) so that only recognized
scenario types are accepted.
….com/qualifire-dev/rogue-private into feature/FIRE-751-rogue-report-to-bff
This reverts commit bb3fb1c.
Summary by CodeRabbit
New Features
Improvements
Tests