From 3574a98b18b5a735ba263999d076e9b60c3b2278 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 15:38:07 +0300 Subject: [PATCH 01/22] Revert "Revert "wip on the new response"" This reverts commit bb3fb1c47a934e46565305d65d34040394ff54c5. --- rogue/tests/models/test_evaluation_result.py | 51 +++++++-- rogue/ui/components/report_generator.py | 111 ++++++++++++++++++- sdks/python/rogue_sdk/types.py | 101 ++++++++++++++++- 3 files changed, 243 insertions(+), 20 deletions(-) diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index b1458423..f7e67e02 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -1,4 +1,5 @@ import pytest +from datetime import datetime from rogue_sdk.types import ( ChatHistory, ChatMessage, @@ -7,6 +8,10 @@ EvaluationResults, Scenario, ) +from rogue.ui.components.report_generator import ( + convert_to_api_format, + ApiEvaluationResult, +) class TestEvaluationResults: @@ -54,26 +59,26 @@ def get_evaluation_result( EvaluationResults(), get_evaluation_result(scenario_1, conversation_1_passed), EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), ), # no overlap from non-empty results ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_2, conversation_1_failed), EvaluationResults( results=[ get_evaluation_result(scenario_1, conversation_1_passed), get_evaluation_result(scenario_2, conversation_1_failed), - ] + ], ), ), # scenario overlap with passed unchanged True -> True ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_passed), EvaluationResults( @@ -86,13 +91,13 @@ def get_evaluation_result( ], passed=True, ), - ] + ], ), ), # scenario overlap with passed changed True -> False ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -105,13 +110,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # scenario overlap with passed unchanged False -> False (#1) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -124,13 +129,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # scenario overlap with passed unchanged False -> False (#2) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result( scenario_1, @@ -146,7 +151,7 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), ], @@ -159,3 +164,27 @@ def test_add_result( ): existing_results.add_result(new_result) assert existing_results == expected_results + + def test_convert_to_api_format(self): + """Test conversion to new API format.""" + results = EvaluationResults() + result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) + 
results.add_result(result) + + api_format = convert_to_api_format(results) + + assert isinstance(api_format, ApiEvaluationResult) + assert len(api_format.scenarios) == 1 + assert api_format.scenarios[0].description == "Scenario 1" + assert api_format.scenarios[0].totalConversations == 1 + assert api_format.scenarios[0].flaggedConversations == 0 + assert len(api_format.scenarios[0].conversations) == 1 + assert api_format.scenarios[0].conversations[0].passed is True + assert api_format.scenarios[0].conversations[0].reason == "reason" + assert len(api_format.scenarios[0].conversations[0].messages) == 1 + + # Test message conversion + message = api_format.scenarios[0].conversations[0].messages[0] + assert message.role == "user" + assert message.content == "message 1" + assert isinstance(message.timestamp, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 538491b1..dd1236b7 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -1,9 +1,97 @@ from pathlib import Path from typing import Tuple +from datetime import datetime, timezone import gradio as gr from loguru import logger from rogue_sdk.types import EvaluationResults +from pydantic import BaseModel +from typing import List, Optional + + +# New API Format Types for report display +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + + +def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format.""" + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + return ApiEvaluationResult(scenarios=api_scenarios) def _load_report_data_from_files( @@ -60,13 +148,24 @@ def on_report_tab_select(state): ) results = 
EvaluationResults() + # Convert to new API format for display + try: + api_format_results = convert_to_api_format(results) + results_json = api_format_results.model_dump_json( + indent=2, + exclude_none=True, + ) + except Exception as e: + logger.warning( + f"Failed to convert results to API format: {e}", + extra={ + "results": results, + }, + ) + results_json = str(results) + return { - evaluation_results_display: gr.update( - value=results.model_dump_json( - indent=2, - exclude_none=True, - ), - ), + evaluation_results_display: gr.update(value=results_json), summary_display: gr.update(value=summary), } diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index b5359716..c2a741fa 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -85,7 +85,7 @@ def check_auth_credentials(self) -> "AgentConfig": if auth_type and auth_type != AuthType.NO_AUTH and not auth_credentials: raise ValueError( - "Authentication Credentials cannot be empty for the selected auth type." + "Authentication Credentials cannot be empty for the selected auth type.", # noqa: E501 ) return self @@ -110,7 +110,7 @@ def validate_dataset_for_type(self) -> "Scenario": if dataset_required and self.dataset is None: raise ValueError( f"`dataset` must be provided when scenario_type is " - f"'{self.scenario_type.value}'" + f"'{self.scenario_type.value}'", ) elif not dataset_required and self.dataset is not None: logger.info( @@ -143,7 +143,7 @@ def get_scenarios_by_type(self, scenario_type: ScenarioType) -> "Scenarios": scenario for scenario in self.scenarios if scenario.scenario_type == scenario_type - ] + ], ) def get_policy_scenarios(self) -> "Scenarios": @@ -207,6 +207,101 @@ def combine(self, other: "EvaluationResults"): self.add_result(result) +# New API Format Types + + +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + + +# Conversion functions for new API format +def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. 
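+
+    A minimal usage sketch (assumes an already-populated results object;
+    the result value is illustrative):
+
+        results = EvaluationResults()
+        results.add_result(some_result)  # some_result: a hypothetical EvaluationResult
+        api_result = convert_to_api_format(results)
+        api_result.model_dump_json(indent=2, exclude_none=True)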
+ + Args: + evaluation_results: Legacy evaluation results to convert + + Returns: + ApiEvaluationResult: New format evaluation result + """ + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + return ApiEvaluationResult(scenarios=api_scenarios) + + # Interview Types From ae68ff99b249086e69ac607da5a940135dc37929 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 15:47:13 +0300 Subject: [PATCH 02/22] wip --- rogue/tests/models/test_evaluation_result.py | 17 +++++- rogue/ui/components/report_generator.py | 59 ++++++++++++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index f7e67e02..1d5c4dbd 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -171,7 +171,14 @@ def test_convert_to_api_format(self): result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) results.add_result(result) - api_format = convert_to_api_format(results) + api_format = convert_to_api_format( + evaluation_results=results, + summary="Test summary", + key_findings="Key finding 1", + recommendation="Test recommendation", + deep_test=True, + judge_model="openai/gpt-4o-mini", + ) assert isinstance(api_format, ApiEvaluationResult) assert len(api_format.scenarios) == 1 @@ -188,3 +195,11 @@ def test_convert_to_api_format(self): assert message.role == "user" assert message.content == "message 1" assert isinstance(message.timestamp, datetime) + + # Test new fields + assert api_format.summary == "Test summary" + assert api_format.keyFindings == "Key finding 1" + assert api_format.recommendation == "Test recommendation" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" + assert isinstance(api_format.startTime, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index dd1236b7..4e8624fc 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -39,10 +39,40 @@ class ApiEvaluationResult(BaseModel): """New API format for evaluation results.""" scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime + judgeModel: Optional[str] = None + + +def convert_to_api_format( + evaluation_results: 
EvaluationResults, + summary: Optional[str] = None, + key_findings: Optional[str] = None, + recommendation: Optional[str] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. + + Args: + evaluation_results: Legacy evaluation results to convert + summary: Generated summary of the evaluation + key_findings: Key findings from the evaluation + recommendation: Recommendations based on the evaluation + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started (defaults to current time) + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format evaluation result with additional metadata + """ + if start_time is None: + start_time = datetime.now(timezone.utc) - -def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: - """Convert legacy EvaluationResults to new API format.""" api_scenarios = [] for result in evaluation_results.results: @@ -91,7 +121,15 @@ def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluatio ), ) - return ApiEvaluationResult(scenarios=api_scenarios) + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary, + keyFindings=key_findings, + recommendation=recommendation, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + ) def _load_report_data_from_files( @@ -150,7 +188,18 @@ def on_report_tab_select(state): # Convert to new API format for display try: - api_format_results = convert_to_api_format(results) + # Extract configuration and additional metadata from state + config = state.get("config", {}) + + api_format_results = convert_to_api_format( + evaluation_results=results, + summary=summary if summary != "No summary available." else None, + key_findings=state.get("key_findings"), + recommendation=state.get("recommendation"), + deep_test=config.get("deep_test_mode", False), + start_time=state.get("start_time"), + judge_model=config.get("judge_llm"), + ) results_json = api_format_results.model_dump_json( indent=2, exclude_none=True, From 354cbd0dcfed74478e002080c449bc31ec8f0685 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 16:05:55 +0300 Subject: [PATCH 03/22] --wip-- [skip ci] --- rogue/server/models/api_format.py | 47 +++++++++ rogue/tests/models/test_evaluation_result.py | 12 +-- rogue/ui/components/report_generator.py | 100 ++++++++++++++++++- 3 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 rogue/server/models/api_format.py diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py new file mode 100644 index 00000000..a374e984 --- /dev/null +++ b/rogue/server/models/api_format.py @@ -0,0 +1,47 @@ +"""API format models for evaluation results. + +These models define the enhanced API format for evaluation results +that includes summary, key findings, recommendations, and metadata. 
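+
+A minimal construction sketch (field values are illustrative only):
+
+    ApiEvaluationResult(
+        startTime=datetime(2025, 9, 4, 12, 0),
+        scenarios=[
+            ApiScenarioResult(
+                description="Refund policy probing",
+                totalConversations=1,
+                flaggedConversations=0,
+                conversations=[],
+            )
+        ],
+    )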
+""" + +from datetime import datetime +from typing import List, Optional + +from pydantic import BaseModel + + +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime + judgeModel: Optional[str] = None diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index 1d5c4dbd..4deb72a7 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -173,9 +173,9 @@ def test_convert_to_api_format(self): api_format = convert_to_api_format( evaluation_results=results, - summary="Test summary", - key_findings="Key finding 1", - recommendation="Test recommendation", + summary="Test summary for overall evaluation", + key_findings="• Key finding 1\n• Key finding 2", + recommendation="• Recommendation 1\n• Recommendation 2", deep_test=True, judge_model="openai/gpt-4o-mini", ) @@ -197,9 +197,9 @@ def test_convert_to_api_format(self): assert isinstance(message.timestamp, datetime) # Test new fields - assert api_format.summary == "Test summary" - assert api_format.keyFindings == "Key finding 1" - assert api_format.recommendation == "Test recommendation" + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" assert api_format.deepTest is True assert api_format.judgeModel == "openai/gpt-4o-mini" assert isinstance(api_format.startTime, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 4e8624fc..07e3c9cf 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -7,6 +7,90 @@ from rogue_sdk.types import EvaluationResults from pydantic import BaseModel from typing import List, Optional +import re + + +def parse_summary_sections(full_summary: str) -> tuple[str, str, str]: + """Parse a comprehensive summary into separate sections. 
+ + Args: + full_summary: The comprehensive summary text + + Returns: + Tuple of (summary, key_findings, recommendations) + """ + if not full_summary: + return None, None, None + + # Extract the main summary section (everything before Key Findings) + summary_match = re.search( + r"(.*?)(?=---\s*##?\s+Key Findings|##?\s+Key Findings)", + full_summary, + re.DOTALL | re.IGNORECASE, + ) + summary_section = "" + if summary_match: + summary_section = summary_match.group(1).strip() + # Clean up extra dashes and formatting + summary_section = re.sub(r"---+\s*$", "", summary_section).strip() + + # Extract Key Findings section + key_findings_match = re.search( + r"##?\s+Key Findings\s*[-]*\s*(.*?)(?=---\s*##?\s+Recommendations|##?\s+Recommendations|##?\s+Detailed Breakdown|$)", # noqa: E501 + full_summary, + re.DOTALL | re.IGNORECASE, + ) + key_findings_section = "" + if key_findings_match: + key_findings_section = key_findings_match.group(1).strip() + # Clean up bullet points and formatting + key_findings_section = re.sub( + r"^-\s*", + "", + key_findings_section, + flags=re.MULTILINE, + ) + key_findings_section = re.sub(r"---+\s*$", "", key_findings_section).strip() + # Fix bullet point formatting + key_findings_section = re.sub(r"\s*-\s*\*\*", "\n• **", key_findings_section) + if not key_findings_section.startswith( + "•", + ) and not key_findings_section.startswith("-"): + key_findings_section = "• " + key_findings_section + + # Extract Recommendations section + recommendations_match = re.search( + r"##?\s+Recommendations\s*[-]*\s*(.*?)(?=---\s*##?\s+Detailed Breakdown|##?\s+Detailed Breakdown|$)", # noqa: E501 + full_summary, + re.DOTALL | re.IGNORECASE, + ) + recommendations_section = "" + if recommendations_match: + recommendations_section = recommendations_match.group(1).strip() + # Clean up formatting + recommendations_section = re.sub( + r"---+\s*$", + "", + recommendations_section, + ).strip() + # Convert all numbered items to bullet points + recommendations_section = re.sub( + r"^\d+\.\s*", + "• ", + recommendations_section, + flags=re.MULTILINE, + ) + recommendations_section = re.sub( + r"\s+\d+\.\s*", + "\n• ", + recommendations_section, + ) + + return ( + summary_section if summary_section else None, + key_findings_section if key_findings_section else None, + recommendations_section if recommendations_section else None, + ) # New API Format Types for report display @@ -191,11 +275,21 @@ def on_report_tab_select(state): # Extract configuration and additional metadata from state config = state.get("config", {}) + # Parse the summary to extract separate sections + if summary and summary != "No summary available.": + parsed_summary, parsed_key_findings, parsed_recommendations = ( + parse_summary_sections(summary) + ) + else: + parsed_summary = None + parsed_key_findings = None + parsed_recommendations = None + api_format_results = convert_to_api_format( evaluation_results=results, - summary=summary if summary != "No summary available." 
else None, - key_findings=state.get("key_findings"), - recommendation=state.get("recommendation"), + summary=parsed_summary, + key_findings=parsed_key_findings or state.get("key_findings"), + recommendation=parsed_recommendations or state.get("recommendation"), deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), From dc2f6e986a4fb485adfc3e4ecc0eed2de5113dbd Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 12:25:57 +0300 Subject: [PATCH 04/22] wip --- examples/js/cli/package.json | 2 +- examples/js/langgraph-js-example/package.json | 2 +- examples/js/vercel-ai-example/package.json | 2 +- .../tshirt_store_agent_executor.py | 17 +- .../tui/internal/theme/themes/vesper.json | 2 +- rogue/common/generic_agent_executor.py | 17 +- rogue/common/remote_agent_connection.py | 4 +- rogue/server/api/llm.py | 11 +- rogue/server/models/api_format.py | 16 ++ rogue/server/services/__init__.py | 1 + rogue/server/services/interviewer_service.py | 6 +- rogue/server/services/llm_service.py | 111 +++++++-- .../services/scenario_evaluation_service.py | 8 +- rogue/tests/models/test_cli_input.py | 3 +- rogue/tests/models/test_evaluation_result.py | 27 +- rogue/ui/components/config_screen.py | 14 +- rogue/ui/components/report_generator.py | 231 +----------------- rogue/ui/components/scenario_runner.py | 1 + sdks/python/rogue_sdk/client.py | 25 +- sdks/python/rogue_sdk/sdk.py | 35 ++- sdks/python/rogue_sdk/tests/test_types.py | 8 +- sdks/python/rogue_sdk/types.py | 11 +- sdks/python/rogue_sdk/websocket.py | 4 +- 23 files changed, 253 insertions(+), 305 deletions(-) diff --git a/examples/js/cli/package.json b/examples/js/cli/package.json index 0afd3083..758f40a0 100644 --- a/examples/js/cli/package.json +++ b/examples/js/cli/package.json @@ -20,4 +20,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/langgraph-js-example/package.json b/examples/js/langgraph-js-example/package.json index 46860e2a..86b52c0b 100644 --- a/examples/js/langgraph-js-example/package.json +++ b/examples/js/langgraph-js-example/package.json @@ -28,4 +28,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/vercel-ai-example/package.json b/examples/js/vercel-ai-example/package.json index a5c1cee5..4c39d140 100644 --- a/examples/js/vercel-ai-example/package.json +++ b/examples/js/vercel-ai-example/package.json @@ -25,4 +25,4 @@ "uuid": "^11.1.0", "zod": "^3.24.1" } -} \ No newline at end of file +} diff --git a/examples/tshirt_store_agent/tshirt_store_agent_executor.py b/examples/tshirt_store_agent/tshirt_store_agent_executor.py index 68286326..32cb739a 100644 --- a/examples/tshirt_store_agent/tshirt_store_agent_executor.py +++ b/examples/tshirt_store_agent/tshirt_store_agent_executor.py @@ -129,7 +129,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -151,15 +151,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( 
inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -185,8 +186,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -196,7 +197,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/packages/tui/internal/theme/themes/vesper.json b/packages/tui/internal/theme/themes/vesper.json index b8406f93..08eade58 100644 --- a/packages/tui/internal/theme/themes/vesper.json +++ b/packages/tui/internal/theme/themes/vesper.json @@ -216,4 +216,4 @@ } } } - \ No newline at end of file + diff --git a/rogue/common/generic_agent_executor.py b/rogue/common/generic_agent_executor.py index 0e1de2f6..059dc77a 100644 --- a/rogue/common/generic_agent_executor.py +++ b/rogue/common/generic_agent_executor.py @@ -128,7 +128,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -150,15 +150,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -184,8 +185,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -195,7 +196,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/rogue/common/remote_agent_connection.py b/rogue/common/remote_agent_connection.py index e08caf83..4fdf080b 100644 --- a/rogue/common/remote_agent_connection.py +++ b/rogue/common/remote_agent_connection.py @@ -81,7 +81,7 @@ async def send_message( SendStreamingMessageRequest( id=uuid4().hex, params=request, - ) + ), ): logger.debug( "received stream response from remote agent", @@ -110,7 +110,7 @@ async def send_message( SendMessageRequest( id=uuid4().hex, params=request, - ) + ), ) logger.debug( diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 4fa434f8..73a312f4 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -9,9 +9,10 @@ ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, - SummaryGenerationResponse, ) +from ..models.api_format import ServerSummaryGenerationResponse + from ...common.logging import get_logger from ..services.llm_service import LLMService @@ -57,8 +58,10 @@ async def generate_scenarios(request: 
ScenarioGenerationRequest): ) -@router.post("/summary", response_model=SummaryGenerationResponse) -async def generate_summary(request: SummaryGenerationRequest): +@router.post("/summary", response_model=ServerSummaryGenerationResponse) +async def generate_summary( + request: SummaryGenerationRequest, +) -> ServerSummaryGenerationResponse: """ Generate evaluation summary from results. @@ -81,7 +84,7 @@ async def generate_summary(request: SummaryGenerationRequest): logger.info("Successfully generated evaluation summary") - return SummaryGenerationResponse( + return ServerSummaryGenerationResponse( summary=summary, message="Successfully generated evaluation summary", ) diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py index a374e984..1142b729 100644 --- a/rogue/server/models/api_format.py +++ b/rogue/server/models/api_format.py @@ -10,6 +10,15 @@ from pydantic import BaseModel +class StructuredSummary(BaseModel): + """Structured summary response from LLM.""" + + overall_summary: str + key_findings: List[str] + recommendations: List[str] + detailed_breakdown: List[dict] # Table rows for scenario breakdown + + class ApiChatMessage(BaseModel): """Chat message for new API format with datetime timestamp.""" @@ -45,3 +54,10 @@ class ApiEvaluationResult(BaseModel): deepTest: bool = False startTime: datetime judgeModel: Optional[str] = None + + +class ServerSummaryGenerationResponse(BaseModel): + """Server response for summary generation with structured summary.""" + + summary: StructuredSummary + message: str diff --git a/rogue/server/services/__init__.py b/rogue/server/services/__init__.py index 8e3466ae..95047763 100644 --- a/rogue/server/services/__init__.py +++ b/rogue/server/services/__init__.py @@ -1,4 +1,5 @@ from . import ( + api_format_service, evaluation_library, evaluation_service, interviewer_service, diff --git a/rogue/server/services/interviewer_service.py b/rogue/server/services/interviewer_service.py index 1caf69b7..d9d9d333 100644 --- a/rogue/server/services/interviewer_service.py +++ b/rogue/server/services/interviewer_service.py @@ -71,7 +71,7 @@ def send_message(self, user_input: str): { "role": "user", "content": user_input, - } + }, ) # Copying the messages to avoid modifying the original list @@ -87,7 +87,7 @@ def send_message(self, user_input: str): "You have asked 3 questions. Now, provide a concise summary of " "the agent's business context based on the conversation." ), - } + }, ) try: @@ -101,7 +101,7 @@ def send_message(self, user_input: str): { "role": "assistant", "content": response.choices[0].message.content, - } + }, ) return response.choices[0].message.content diff --git a/rogue/server/services/llm_service.py b/rogue/server/services/llm_service.py index de46177f..aa229c04 100644 --- a/rogue/server/services/llm_service.py +++ b/rogue/server/services/llm_service.py @@ -1,9 +1,12 @@ +import json from typing import Optional from litellm import completion from loguru import logger from rogue_sdk.types import EvaluationResults, Scenario, Scenarios, ScenarioType +from ..models.api_format import StructuredSummary + SCENARIO_GENERATION_SYSTEM_PROMPT = """ # Test Scenario Designer @@ -98,7 +101,7 @@ # Evaluation Results Summarizer You are a test results summarizer. Your task is to analyze the provided evaluation results -and generate a concise, insightful, and human-readable summary in Markdown format. +and generate a structured JSON response with the summary components. 
## Evaluation Results (JSON) @@ -106,22 +109,47 @@ ## Your Task -Based on the JSON data above, create a summary that includes: +Based on the JSON data above, create a structured summary that includes: -1. **Overall Summary**: A brief, high-level overview of the agent's performance, - highlighting the pass/fail ratio and any critical issues discovered. -2. **Key Findings**: Bullet points detailing the most significant discoveries, both - positive and negative. Focus on patterns of failure or notable successes. -3. **Recommendations**: Suggest concrete next steps for improving the agent. These +1. **overall_summary**: A brief, high-level overview of the agent's performance, + highlighting the pass/fail ratio and any critical issues discovered. Return as a single string. +2. **key_findings**: List of the most significant discoveries, both positive and negative. + Focus on patterns of failure or notable successes. Return as an array of strings. +3. **recommendations**: List of concrete next steps for improving the agent. These could include fixing specific bugs, improving training data, or clarifying policies. -4. **Detailed Breakdown**: A table that provides a granular look at each - scenario that was tested, including the pass/fail with the appropriate emoji ✅/❌ status and a brief note on the outcome. + Return as an array of strings. +4. **detailed_breakdown**: Array of objects representing a table that provides a granular + look at each scenario tested. Each object should have: scenario, status (✅/❌), outcome. + +## Output Format +You MUST respond with valid JSON in exactly this format: + +```json +{ + "overall_summary": "Brief overview text here...", + "key_findings": [ + "First key finding", + "Second key finding" + ], + "recommendations": [ + "First recommendation", + "Second recommendation" + ], + "detailed_breakdown": [ + { + "scenario": "Scenario name", + "status": "✅", + "outcome": "Brief outcome description" + } + ] +} +``` ## Guidelines - Use clear and professional language. -- Format the output using Markdown for readability (headings, bold text, lists, etc.). - Be objective and base your summary strictly on the provided data. -- Ensure the summary is well-organized and easy to navigate. +- Return ONLY valid JSON - no markdown, no explanations, no additional text. +- Ensure all strings are properly escaped for JSON. """ # noqa: E501 @@ -142,13 +170,18 @@ def generate_scenarios( context: str, llm_provider_api_key: Optional[str] = None, ) -> Scenarios: - """ - Generates scenarios for the given business context using the given model. - :param model: LLM model to use for scenario generation. - :param context: Business context to use for scenario generation. - :param llm_provider_api_key: api key for the LLM provider - (if applicable, env can also be used instead). - :return: The generated scenarios + """Generate test scenarios from business context using LLM. 
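+
+        A minimal call sketch (the model name and business context are
+        illustrative, and `llm_service` is assumed to be an LLMService
+        instance):
+
+            scenarios = llm_service.generate_scenarios(
+                model="openai/gpt-4o-mini",
+                context="Support agent for an online t-shirt store",
+            )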
+ + Args: + model: LLM model to use for generation + context: Business context description for scenario generation + llm_provider_api_key: API key for the LLM provider + + Returns: + Scenarios: Generated test scenarios + + Raises: + Exception: If scenario generation fails """ system_prompt = SCENARIO_GENERATION_SYSTEM_PROMPT.replace( r"{$BUSINESS_CONTEXT}", @@ -188,7 +221,7 @@ def generate_summary_from_results( model: str, results: EvaluationResults, llm_provider_api_key: Optional[str] = None, - ) -> str: + ) -> StructuredSummary: system_prompt = SUMMARY_GENERATION_SYSTEM_PROMPT.replace( r"{$EVALUATION_RESULTS}", results.model_dump_json(indent=2), @@ -198,7 +231,10 @@ def generate_summary_from_results( {"role": "system", "content": system_prompt}, { "role": "user", - "content": "Please generate the summary based on the provided results.", + "content": ( + "Please generate the structured summary based on the " + "provided results." + ), }, ] @@ -210,7 +246,38 @@ def generate_summary_from_results( messages=messages, api_key=api_key, ) - return response.choices[0].message.content + + # Parse the JSON response from the LLM + content = response.choices[0].message.content.strip() + + # Remove markdown code blocks if present + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + # Parse JSON and create StructuredSummary + summary_data = json.loads(content) + return StructuredSummary(**summary_data) + + except json.JSONDecodeError as e: + logger.exception(f"Failed to parse JSON response from LLM: {e}") + # Return a fallback structured summary + return StructuredSummary( + overall_summary="Error: Could not parse summary response from LLM.", + key_findings=["Unable to generate key findings due to parsing error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) except Exception: logger.exception("Failed to generate summary from results") - return "Error: Could not generate a summary for the evaluation results." + # Return a fallback structured summary + return StructuredSummary( + overall_summary=( + "Error: Could not generate a summary for the evaluation results." + ), + key_findings=["Unable to generate key findings due to system error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) diff --git a/rogue/server/services/scenario_evaluation_service.py b/rogue/server/services/scenario_evaluation_service.py index 4ed69ba0..8de94c76 100644 --- a/rogue/server/services/scenario_evaluation_service.py +++ b/rogue/server/services/scenario_evaluation_service.py @@ -78,7 +78,7 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: results = data if results and results.results: logger.info( - f"📊 Processing {len(results.results)} evaluation results" + f"📊 Processing {len(results.results)} evaluation results", ) for res in results.results: self._results.add_result(res) @@ -86,12 +86,12 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: logger.warning("⚠️ Received results update but no results data") else: # it's a 'chat' or 'status' update logger.debug( - f"🔄 Forwarding {update_type} update: {str(data)[:50]}..." + f"🔄 Forwarding {update_type} update: {str(data)[:50]}...", ) yield update_type, data logger.info( - f"🏁 arun_evaluator_agent completed. Total updates: {update_count}" + f"🏁 arun_evaluator_agent completed. 
Total updates: {update_count}", ) except Exception as e: @@ -132,6 +132,6 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: ( "✅ ScenarioEvaluationService completed with " f"{len(self._results.results)} total results" - ) + ), ) yield "done", self._results diff --git a/rogue/tests/models/test_cli_input.py b/rogue/tests/models/test_cli_input.py index 627bc921..e2571b47 100644 --- a/rogue/tests/models/test_cli_input.py +++ b/rogue/tests/models/test_cli_input.py @@ -32,7 +32,8 @@ def test_check_auth_credentials(self, auth_type, credentials, should_raise): if should_raise: with pytest.raises( - ValidationError, match="Authentication Credentials cannot be empty" + ValidationError, + match="Authentication Credentials cannot be empty", ): CLIInput(**input_data) else: diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index 4deb72a7..4c6277e8 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -8,10 +8,8 @@ EvaluationResults, Scenario, ) -from rogue.ui.components.report_generator import ( - convert_to_api_format, - ApiEvaluationResult, -) +from rogue.server.services.api_format_service import convert_to_api_format +from rogue.server.models.api_format import ApiEvaluationResult, StructuredSummary class TestEvaluationResults: @@ -171,11 +169,19 @@ def test_convert_to_api_format(self): result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) results.add_result(result) + # Create structured summary for testing + structured_summary = StructuredSummary( + overall_summary="Test summary for overall evaluation", + key_findings=["Key finding 1", "Key finding 2"], + recommendations=["Recommendation 1", "Recommendation 2"], + detailed_breakdown=[ + {"scenario": "Test", "status": "✅", "outcome": "Passed"}, + ], + ) + api_format = convert_to_api_format( evaluation_results=results, - summary="Test summary for overall evaluation", - key_findings="• Key finding 1\n• Key finding 2", - recommendation="• Recommendation 1\n• Recommendation 2", + structured_summary=structured_summary, deep_test=True, judge_model="openai/gpt-4o-mini", ) @@ -186,6 +192,13 @@ def test_convert_to_api_format(self): assert api_format.scenarios[0].totalConversations == 1 assert api_format.scenarios[0].flaggedConversations == 0 assert len(api_format.scenarios[0].conversations) == 1 + + # Test structured summary fields + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" assert api_format.scenarios[0].conversations[0].passed is True assert api_format.scenarios[0].conversations[0].reason == "reason" assert len(api_format.scenarios[0].conversations[0].messages) == 1 diff --git a/rogue/ui/components/config_screen.py b/rogue/ui/components/config_screen.py index bd8e5793..e32e1cb5 100644 --- a/rogue/ui/components/config_screen.py +++ b/rogue/ui/components/config_screen.py @@ -36,7 +36,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, you'll be guided through an AI-powered interview to " - "extract your agent's business context. Turn off to skip this step." + "extract your agent's business context. 
Turn off to skip this step.", ) gr.Markdown("**Deep Test Mode**") @@ -46,7 +46,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, the evaluator will " - "approach each scenario from different angles" + "approach each scenario from different angles", ) gr.Markdown("### Parallel Runs") @@ -76,7 +76,8 @@ def create_config_screen( ), ) auth_credentials_error = gr.Markdown( - visible=False, elem_classes=["error-label"] + visible=False, + elem_classes=["error-label"], ) gr.Markdown("## Evaluator Configuration") @@ -84,12 +85,12 @@ def create_config_screen( "Specify the models for the evaluation process. " "The **Service LLM** will be used to interview, " "generate scenarios and summaries. The **Judge LLM** is used by the " - "evaluator agent to score the agent's performance against those scenarios." + "evaluator agent to score the agent's performance against those scenarios.", ) gr.Markdown( "ℹ️ Under the hood we're using `litellm`. See the " "[list of supported models](https://docs.litellm.ai/docs/providers). " - "You can use environment variables for API keys." + "You can use environment variables for API keys.", ) service_llm = gr.Textbox( @@ -226,7 +227,8 @@ def save_config( msg = error["msg"] if loc in error_labels: error_updates[error_labels[loc]] = gr.update( - value=f"**Error:** {msg}", visible=True + value=f"**Error:** {msg}", + visible=True, ) else: logger.exception("Unhandled validation error") diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 07e3c9cf..2aa71aa4 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -1,219 +1,11 @@ from pathlib import Path from typing import Tuple -from datetime import datetime, timezone import gradio as gr from loguru import logger from rogue_sdk.types import EvaluationResults -from pydantic import BaseModel -from typing import List, Optional -import re - -def parse_summary_sections(full_summary: str) -> tuple[str, str, str]: - """Parse a comprehensive summary into separate sections. 
- - Args: - full_summary: The comprehensive summary text - - Returns: - Tuple of (summary, key_findings, recommendations) - """ - if not full_summary: - return None, None, None - - # Extract the main summary section (everything before Key Findings) - summary_match = re.search( - r"(.*?)(?=---\s*##?\s+Key Findings|##?\s+Key Findings)", - full_summary, - re.DOTALL | re.IGNORECASE, - ) - summary_section = "" - if summary_match: - summary_section = summary_match.group(1).strip() - # Clean up extra dashes and formatting - summary_section = re.sub(r"---+\s*$", "", summary_section).strip() - - # Extract Key Findings section - key_findings_match = re.search( - r"##?\s+Key Findings\s*[-]*\s*(.*?)(?=---\s*##?\s+Recommendations|##?\s+Recommendations|##?\s+Detailed Breakdown|$)", # noqa: E501 - full_summary, - re.DOTALL | re.IGNORECASE, - ) - key_findings_section = "" - if key_findings_match: - key_findings_section = key_findings_match.group(1).strip() - # Clean up bullet points and formatting - key_findings_section = re.sub( - r"^-\s*", - "", - key_findings_section, - flags=re.MULTILINE, - ) - key_findings_section = re.sub(r"---+\s*$", "", key_findings_section).strip() - # Fix bullet point formatting - key_findings_section = re.sub(r"\s*-\s*\*\*", "\n• **", key_findings_section) - if not key_findings_section.startswith( - "•", - ) and not key_findings_section.startswith("-"): - key_findings_section = "• " + key_findings_section - - # Extract Recommendations section - recommendations_match = re.search( - r"##?\s+Recommendations\s*[-]*\s*(.*?)(?=---\s*##?\s+Detailed Breakdown|##?\s+Detailed Breakdown|$)", # noqa: E501 - full_summary, - re.DOTALL | re.IGNORECASE, - ) - recommendations_section = "" - if recommendations_match: - recommendations_section = recommendations_match.group(1).strip() - # Clean up formatting - recommendations_section = re.sub( - r"---+\s*$", - "", - recommendations_section, - ).strip() - # Convert all numbered items to bullet points - recommendations_section = re.sub( - r"^\d+\.\s*", - "• ", - recommendations_section, - flags=re.MULTILINE, - ) - recommendations_section = re.sub( - r"\s+\d+\.\s*", - "\n• ", - recommendations_section, - ) - - return ( - summary_section if summary_section else None, - key_findings_section if key_findings_section else None, - recommendations_section if recommendations_section else None, - ) - - -# New API Format Types for report display -class ApiChatMessage(BaseModel): - """Chat message for new API format with datetime timestamp.""" - - role: str - content: str - timestamp: datetime - - -class ApiConversationEvaluation(BaseModel): - """Conversation evaluation for new API format.""" - - passed: bool - messages: List[ApiChatMessage] - reason: Optional[str] = None - - -class ApiScenarioResult(BaseModel): - """Result of evaluating a single scenario in new API format.""" - - description: Optional[str] = None - totalConversations: Optional[int] = None - flaggedConversations: Optional[int] = None - conversations: List[ApiConversationEvaluation] - - -class ApiEvaluationResult(BaseModel): - """New API format for evaluation results.""" - - scenarios: List[ApiScenarioResult] - summary: Optional[str] = None - keyFindings: Optional[str] = None - recommendation: Optional[str] = None - deepTest: bool = False - startTime: datetime - judgeModel: Optional[str] = None - - -def convert_to_api_format( - evaluation_results: EvaluationResults, - summary: Optional[str] = None, - key_findings: Optional[str] = None, - recommendation: Optional[str] = None, - deep_test: bool = 
False, - start_time: Optional[datetime] = None, - judge_model: Optional[str] = None, -) -> ApiEvaluationResult: - """Convert legacy EvaluationResults to new API format. - - Args: - evaluation_results: Legacy evaluation results to convert - summary: Generated summary of the evaluation - key_findings: Key findings from the evaluation - recommendation: Recommendations based on the evaluation - deep_test: Whether deep test mode was enabled - start_time: When the evaluation started (defaults to current time) - judge_model: The LLM judge model used - - Returns: - ApiEvaluationResult: New format evaluation result with additional metadata - """ - if start_time is None: - start_time = datetime.now(timezone.utc) - - api_scenarios = [] - - for result in evaluation_results.results: - # Convert conversations to new format - api_conversations = [] - for conv_eval in result.conversations: - # Convert ChatHistory messages to ApiChatMessage - api_messages = [] - for msg in conv_eval.messages.messages: - timestamp = datetime.now(timezone.utc) - if msg.timestamp: - try: - if isinstance(msg.timestamp, str): - timestamp = datetime.fromisoformat( - msg.timestamp.replace("Z", "+00:00"), - ) - else: - timestamp = msg.timestamp - except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) - - api_messages.append( - ApiChatMessage( - role=msg.role, - content=msg.content, - timestamp=timestamp, - ), - ) - - api_conversations.append( - ApiConversationEvaluation( - passed=conv_eval.passed, - messages=api_messages, - reason=conv_eval.reason if conv_eval.reason else None, - ), - ) - - api_scenarios.append( - ApiScenarioResult( - description=result.scenario.scenario, - totalConversations=len(api_conversations), - flaggedConversations=len( - [c for c in api_conversations if not c.passed], - ), - conversations=api_conversations, - ), - ) - - return ApiEvaluationResult( - scenarios=api_scenarios, - summary=summary, - keyFindings=key_findings, - recommendation=recommendation, - deepTest=deep_test, - startTime=start_time, - judgeModel=judge_model, - ) +from ...server.services.api_format_service import convert_with_structured_summary def _load_report_data_from_files( @@ -270,26 +62,17 @@ def on_report_tab_select(state): ) results = EvaluationResults() - # Convert to new API format for display + # Convert to new API format for display using server service try: # Extract configuration and additional metadata from state config = state.get("config", {}) - # Parse the summary to extract separate sections - if summary and summary != "No summary available.": - parsed_summary, parsed_key_findings, parsed_recommendations = ( - parse_summary_sections(summary) - ) - else: - parsed_summary = None - parsed_key_findings = None - parsed_recommendations = None - - api_format_results = convert_to_api_format( + # For now, pass None for structured_summary since UI still uses + # string summaries. 
This will be updated when the UI summary generation + # is converted to structured format + api_format_results = convert_with_structured_summary( evaluation_results=results, - summary=parsed_summary, - key_findings=parsed_key_findings or state.get("key_findings"), - recommendation=parsed_recommendations or state.get("recommendation"), + structured_summary=None, # TODO: Convert UI to use structured summaries deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index a1ea60ba..d63f3866 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -481,6 +481,7 @@ def on_status_update(status_data): # final_output_path.write_text(all_results.model_dump_json(indent=2)) # Generate summary using SDK (server-based) + summary = "Summary generation failed." try: sdk_config = RogueClientConfig( base_url=state.get("rogue_server_url", "http://localhost:8000"), diff --git a/sdks/python/rogue_sdk/client.py b/sdks/python/rogue_sdk/client.py index 966e5558..f88b7fd2 100644 --- a/sdks/python/rogue_sdk/client.py +++ b/sdks/python/rogue_sdk/client.py @@ -24,6 +24,7 @@ SendMessageResponse, StartInterviewRequest, StartInterviewResponse, + StructuredSummary, SummaryGenerationRequest, SummaryGenerationResponse, ) @@ -162,7 +163,27 @@ async def generate_summary( "/api/v1/llm/summary", json=data.model_dump(mode="json"), ) - return SummaryGenerationResponse(**response) + + # Handle server's structured summary response + summary_data = response.get("summary", {}) + if isinstance(summary_data, dict) and "overall_summary" in summary_data: + # Server returned StructuredSummary - convert to our expected format + structured_summary = StructuredSummary(**summary_data) + return SummaryGenerationResponse( + summary=structured_summary, + message=response.get("message", "Successfully generated summary"), + ) + else: + # Fallback for legacy string response + return SummaryGenerationResponse( + summary=StructuredSummary( + overall_summary=str(summary_data), + key_findings=[], + recommendations=[], + detailed_breakdown=[], + ), + message=response.get("message", "Successfully generated summary"), + ) async def start_interview( self, @@ -237,7 +258,7 @@ async def wait_for_evaluation( elapsed = asyncio.get_running_loop().time() - start_time if elapsed >= max_wait_time: raise TimeoutError( - f"Evaluation {job_id} did not complete within {max_wait_time}s" + f"Evaluation {job_id} did not complete within {max_wait_time}s", ) await asyncio.sleep(poll_interval) diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index efca694c..ba89cfdb 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -114,7 +114,7 @@ def on_websocket_event( """Add WebSocket event handler.""" if not self.ws_client: raise RuntimeError( - "WebSocket not connected. Call connect_websocket() first." + "WebSocket not connected. 
Call connect_websocket() first.", ) self.ws_client.on(event, handler) @@ -192,7 +192,7 @@ def handle_final_job_result(task): result_future.set_result(result) else: result_future.set_exception( - Exception("Failed to retrieve final job result") + Exception("Failed to retrieve final job result"), ) except Exception as e: result_future.set_exception(e) @@ -210,7 +210,7 @@ def handle_chat_update(event, data): def handle_error(event, data): if not result_future.done(): result_future.set_exception( - Exception(f"WebSocket error: {data.get('error')}") + Exception(f"WebSocket error: {data.get('error')}"), ) # Connect WebSocket for updates @@ -228,7 +228,7 @@ def handle_error(event, data): return result except asyncio.TimeoutError: raise TimeoutError( - f"Evaluation {job_id} did not complete within {timeout}s" + f"Evaluation {job_id} did not complete within {timeout}s", ) finally: await self.disconnect_websocket() @@ -296,7 +296,32 @@ async def generate_summary( api_key=api_key, ) - return response_data.summary + # Convert structured summary back to string format for backward compatibility + structured_summary = response_data.summary + if hasattr(structured_summary, "overall_summary"): + # Format as markdown string for UI display + summary_parts = [ + f"# Evaluation Results Summary\n\n## Overall Summary\n" + f"{structured_summary.overall_summary}", + ] + + if structured_summary.key_findings: + findings = "\n".join( + f"- {finding}" for finding in structured_summary.key_findings + ) + summary_parts.append(f"\n---\n\n## Key Findings\n{findings}") + + if structured_summary.recommendations: + recommendations = "\n".join( + f"{i + 1}. {rec}" + for i, rec in enumerate(structured_summary.recommendations) + ) + summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}") + + return "\n".join(summary_parts) + else: + # Fallback for string response + return str(structured_summary) async def start_interview( self, diff --git a/sdks/python/rogue_sdk/tests/test_types.py b/sdks/python/rogue_sdk/tests/test_types.py index db652f7c..aa906bde 100644 --- a/sdks/python/rogue_sdk/tests/test_types.py +++ b/sdks/python/rogue_sdk/tests/test_types.py @@ -42,7 +42,10 @@ def test_validate_dataset_for_type(self, scenario_type, dataset, should_raise): ], ) def test_validate_dataset_sample_size( - self, dataset, dataset_sample_size, should_raise + self, + dataset, + dataset_sample_size, + should_raise, ): input_data = { "scenario": "Test Scenario", @@ -55,7 +58,8 @@ def test_validate_dataset_sample_size( if should_raise: with pytest.raises( - ValidationError, match="`dataset_sample_size` must be set" + ValidationError, + match="`dataset_sample_size` must be set", ): Scenario(**input_data) else: diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index c2a741fa..b1b8706c 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -433,10 +433,19 @@ class SummaryGenerationRequest(BaseModel): api_key: Optional[str] = None +class StructuredSummary(BaseModel): + """Structured summary response from LLM.""" + + overall_summary: str + key_findings: List[str] + recommendations: List[str] + detailed_breakdown: List[dict] # Table rows for scenario breakdown + + class SummaryGenerationResponse(BaseModel): """Response containing generated summary.""" - summary: str + summary: StructuredSummary message: str diff --git a/sdks/python/rogue_sdk/websocket.py b/sdks/python/rogue_sdk/websocket.py index f0a79797..7b9d1b76 100644 --- a/sdks/python/rogue_sdk/websocket.py +++ 
b/sdks/python/rogue_sdk/websocket.py @@ -164,7 +164,7 @@ def _emit(self, event: WebSocketEventType, data: Any) -> None: ) if t.exception() else None - ) + ), ) else: handler(event, data) @@ -180,7 +180,7 @@ async def _schedule_reconnect(self) -> None: delay = self.reconnect_delay * (2 ** (self.reconnect_attempts - 1)) logger.info( - f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s" + f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s", ) await asyncio.sleep(delay) From d682161be29d170c9b63c1a086f13a1db12e5087 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 12:31:02 +0300 Subject: [PATCH 05/22] wip --- rogue/server/models/__init__.py | 15 +++ rogue/server/services/api_format_service.py | 141 ++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 rogue/server/models/__init__.py create mode 100644 rogue/server/services/api_format_service.py diff --git a/rogue/server/models/__init__.py b/rogue/server/models/__init__.py new file mode 100644 index 00000000..9a5b64b5 --- /dev/null +++ b/rogue/server/models/__init__.py @@ -0,0 +1,15 @@ +"""Server models for the Rogue Agent Evaluator.""" + +from .api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, +) + +__all__ = [ + "ApiChatMessage", + "ApiConversationEvaluation", + "ApiEvaluationResult", + "ApiScenarioResult", +] diff --git a/rogue/server/services/api_format_service.py b/rogue/server/services/api_format_service.py new file mode 100644 index 00000000..44ccc2cf --- /dev/null +++ b/rogue/server/services/api_format_service.py @@ -0,0 +1,141 @@ +"""Service for converting evaluation results to API format. + +This service handles the conversion from legacy EvaluationResults +to the new enhanced API format with structured summary data. +""" + +from datetime import datetime, timezone +from typing import Optional + +from rogue_sdk.types import EvaluationResults + +from ..models.api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, + StructuredSummary, +) + + +def convert_to_api_format( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. 
+ + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM with separate sections + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started (defaults to current time) + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format evaluation result with additional metadata + """ + if start_time is None: + start_time = datetime.now(timezone.utc) + + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + # Extract structured summary components + summary = None + key_findings = None + recommendation = None + + if structured_summary: + summary = structured_summary.overall_summary + key_findings = "\n".join( + f"• {finding}" for finding in structured_summary.key_findings + ) + recommendation = "\n".join( + f"• {rec}" for rec in structured_summary.recommendations + ) + + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary, + keyFindings=key_findings, + recommendation=recommendation, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + ) + + +def convert_with_structured_summary( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert to API format with structured summary. 
+ + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format with structured summary data + """ + return convert_to_api_format( + evaluation_results=evaluation_results, + structured_summary=structured_summary, + deep_test=deep_test, + start_time=start_time, + judge_model=judge_model, + ) From 1d7e28a934d3c0641f14bb242e19089c90f8e4fc Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 13:16:57 +0300 Subject: [PATCH 06/22] server now respond with the results --- rogue/run_cli.py | 2 +- rogue/ui/components/report_generator.py | 2 +- rogue/ui/components/scenario_runner.py | 4 +++- sdks/python/rogue_sdk/sdk.py | 18 ++++++++++++++---- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index acf8979e..f0894eeb 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -203,7 +203,7 @@ async def create_report( sdk = RogueSDK(sdk_config) try: - summary = await sdk.generate_summary( + summary, structured_summary = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 2aa71aa4..db554368 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -72,7 +72,7 @@ def on_report_tab_select(state): # is converted to structured format api_format_results = convert_with_structured_summary( evaluation_results=results, - structured_summary=None, # TODO: Convert UI to use structured summaries + structured_summary=state.get("structured_summary"), deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index d63f3866..b1fa6d7e 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -489,12 +489,14 @@ def on_status_update(status_data): ) sdk = RogueSDK(sdk_config) - summary = await sdk.generate_summary( + summary, structured_summary = await sdk.generate_summary( results=all_results, model=config.get("service_llm"), api_key=config.get("judge_llm_api_key"), ) + state["structured_summary"] = structured_summary + await sdk.close() except Exception: logger.exception("Summary generation failed") diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index ba89cfdb..98bdc55a 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -5,7 +5,7 @@ """ import asyncio -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Tuple from loguru import logger from pydantic import HttpUrl @@ -25,6 +25,7 @@ RogueClientConfig, Scenarios, SendMessageResponse, + StructuredSummary, WebSocketEventType, ) from .websocket import RogueWebSocketClient @@ -288,7 +289,7 @@ async def generate_summary( results: EvaluationResults, model: str = "openai/gpt-4o-mini", api_key: Optional[str] = None, - ) -> str: + ) -> Tuple[str, StructuredSummary]: """Generate evaluation summary from results.""" response_data = await self.http_client.generate_summary( results=results, @@ -318,10 +319,19 @@ async def generate_summary( ) summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}") - return "\n".join(summary_parts) 
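
As an illustration of the markdown assembly this hunk extends (not part of the diff; all sample values are invented), a minimal sketch pushing a toy StructuredSummary through the same formatting rules:

    # Sketch only: mirrors the sections generate_summary() emits for the UI.
    from rogue_sdk.types import StructuredSummary

    s = StructuredSummary(
        overall_summary="The agent held policy in 2 of 3 scenarios.",
        key_findings=["Revealed internal tool names under pressure."],
        recommendations=["Add an explicit refusal rule for tool disclosure."],
        detailed_breakdown=[
            {"scenario": "Discount abuse", "status": "failed", "outcome": "Gave an unapproved discount"},
        ],
    )

    parts = [f"# Evaluation Results Summary\n\n## Overall Summary\n{s.overall_summary}"]
    if s.key_findings:
        findings = "\n".join(f"- {finding}" for finding in s.key_findings)
        parts.append(f"\n---\n\n## Key Findings\n{findings}")
    if s.recommendations:
        recs = "\n".join(f"{i + 1}. {rec}" for i, rec in enumerate(s.recommendations))
        parts.append(f"\n---\n\n## Recommendations\n{recs}")
    print("\n".join(parts))  # the markdown string handed back to the caller
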
+ if structured_summary.detailed_breakdown: + breakdown = "\n".join( + f"{i + 1}. {row}" + for i, row in enumerate(structured_summary.detailed_breakdown) + ) + summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{breakdown}") + + summary_parts.append("\n---\n") + + return "\n".join(summary_parts), structured_summary else: # Fallback for string response - return str(structured_summary) + return str(structured_summary), structured_summary async def start_interview( self, From 6e0d9eed9e038267767bca80f38e236755a5a1c5 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 18:16:38 +0300 Subject: [PATCH 07/22] refactor the detailed breakdown --- packages/tui/go.mod | 1 + packages/tui/go.sum | 2 + packages/tui/internal/tui/app.go | 59 +++++++++++++++++------- packages/tui/internal/tui/evaluation.go | 38 +++++++++------ packages/tui/internal/tui/report_view.go | 18 ++++---- 5 files changed, 80 insertions(+), 38 deletions(-) diff --git a/packages/tui/go.mod b/packages/tui/go.mod index 2e986de1..413f9fe3 100644 --- a/packages/tui/go.mod +++ b/packages/tui/go.mod @@ -23,6 +23,7 @@ require ( require ( github.com/alecthomas/chroma/v2 v2.20.0 + github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 github.com/charmbracelet/colorprofile v0.3.1 // indirect github.com/charmbracelet/x/cellbuf v0.0.14-0.20250505150409-97991a1f17d1 // indirect github.com/charmbracelet/x/term v0.2.1 // indirect diff --git a/packages/tui/go.sum b/packages/tui/go.sum index a69ee239..d4e9da06 100644 --- a/packages/tui/go.sum +++ b/packages/tui/go.sum @@ -6,6 +6,8 @@ github.com/alecthomas/repr v0.5.1 h1:E3G4t2QbHTSNpPKBgMTln5KLkZHLOcU7r37J4pXBuIg github.com/alecthomas/repr v0.5.1/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 h1:swACzss0FjnyPz1enfX56GKkLiuKg5FlyVmOLIlU2kE= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1/go.mod h1:6HamsBKWqEC/FVHuQMHgQL+knPyvHH55HwJDHl/adMw= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4 h1:UgUuKKvBwgqm2ZEL+sKv/OLeavrUb4gfHgdxe6oIOno= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4/go.mod h1:0wWFRpsgF7vHsCukVZ5LAhZkiR4j875H6KEM2/tFQmA= github.com/charmbracelet/colorprofile v0.3.1 h1:k8dTHMd7fgw4bnFd7jXTLZrSU/CQrKnL3m+AxCzDz40= diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 1d9746aa..1a59e45b 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -8,7 +8,9 @@ import ( "strings" "time" + "github.com/charmbracelet/bubbles/v2/table" tea "github.com/charmbracelet/bubbletea/v2" + "github.com/pelletier/go-toml/v2" "github.com/rogue/tui/internal/components" "github.com/rogue/tui/internal/theme" @@ -85,7 +87,31 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - summary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + + overallSummary := structuredSummary.Summary.OverallSummary + keyFindings := structuredSummary.Summary.KeyFindings + parsedKeyFindings := "" + for _, finding := range keyFindings { + parsedKeyFindings += "- " + finding + "\n" + } + recommendations := structuredSummary.Summary.Recommendations + parsedRecommendations := "" + for _, recommendation := range 
recommendations { + parsedRecommendations += "- " + recommendation + "\n" + } + + detailedBreakdown := structuredSummary.Summary.DetailedBreakdown + parsedDetailedBreakdown := "" + for _, breakdown := range detailedBreakdown { + parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" + } + + summary := "## Overall Summary\n\n" + overallSummary + + "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + + "\n\n" + "## Recommendations\n\n" + parsedRecommendations + + "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown + return SummaryGeneratedMsg{ Summary: summary, Err: err, @@ -131,20 +157,21 @@ type App struct { // Model represents the main application state type Model struct { - currentScreen Screen - width int - height int - input string - cursor int - evaluations []Evaluation - scenarios []Scenario - config Config - version string - commandInput components.CommandInput - dialog *components.Dialog - dialogStack []components.Dialog - llmDialog *components.LLMConfigDialog - scenarioEditor components.ScenarioEditor + currentScreen Screen + width int + height int + input string + cursor int + evaluations []Evaluation + scenarios []Scenario + config Config + version string + commandInput components.CommandInput + dialog *components.Dialog + dialogStack []components.Dialog + llmDialog *components.LLMConfigDialog + scenarioEditor components.ScenarioEditor + detailedBreakdown []table.Row // Spinners for loading states healthSpinner components.Spinner @@ -238,7 +265,7 @@ func (a *App) Run() error { // Initialize viewports eventsViewport: components.NewViewport(1, 80, 20), summaryViewport: components.NewViewport(2, 80, 20), - reportViewport: components.NewViewport(3, 80, 20), + reportViewport: components.NewViewport(3, 80, 15), focusedViewport: 0, // Start with events viewport focused eventsAutoScroll: true, // Start with auto-scroll enabled } diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index ecbd6a2c..69bdbc2d 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -92,6 +92,20 @@ type RogueSDK struct { ws *websocket.Conn } +type SummaryResp struct { + Summary struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + Recommendations []string `json:"recommendations"` + DetailedBreakdown []struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` + } `json:"detailed_breakdown"` + } `json:"summary"` + Message string `json:"message"` +} + // NewRogueSDK creates a new SDK instance func NewRogueSDK(baseURL string) *RogueSDK { return &RogueSDK{ @@ -438,15 +452,15 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (string, error) { +func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { - return "", fmt.Errorf("failed to get evaluation results: %w", err) + return nil, fmt.Errorf("failed to get evaluation results: %w", err) } if job.Results == nil { - return "", fmt.Errorf("no results available for job %s", jobID) + return nil, fmt.Errorf("no results available for job %s", jobID) } // Prepare 
summary request - match server's SummaryGenerationRequest format @@ -460,12 +474,12 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s body, err := json.Marshal(summaryReq) if err != nil { - return "", err + return nil, err } req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/summary", bytes.NewReader(body)) if err != nil { - return "", err + return nil, err } req.Header.Set("Content-Type", "application/json") @@ -476,24 +490,22 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s resp, err := longTimeoutClient.Do(req) if err != nil { - return "", err + return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) - return "", fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) + return nil, fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) } - var summaryResp struct { - Summary string `json:"summary"` - Message string `json:"message"` - } + var summaryResp SummaryResp + if err := json.NewDecoder(resp.Body).Decode(&summaryResp); err != nil { - return "", err + return nil, err } - return summaryResp.Summary, nil + return &summaryResp, nil } // CheckServerHealth calls GET /health and returns the status string diff --git a/packages/tui/internal/tui/report_view.go b/packages/tui/internal/tui/report_view.go index 08c71e45..76ba8c5d 100644 --- a/packages/tui/internal/tui/report_view.go +++ b/packages/tui/internal/tui/report_view.go @@ -22,7 +22,7 @@ func (m Model) renderReport() string { // Main container style with full width and height background mainStyle := lipgloss.NewStyle(). Width(m.width). - Height(m.height - 1). // -1 for footer + Height(m.height - 12). Background(t.Background()) // Title style @@ -58,12 +58,12 @@ func (m Model) renderReport() string { } // Calculate viewport dimensions - viewportWidth := m.width - 4 // Leave margins - viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) + viewportWidth := m.width - 8 // Leave margins + viewportHeight := m.height - 4 // title(3) + help(1) + margins(4) // Create a temporary copy of the viewport to avoid modifying the original viewport := m.reportViewport - viewport.SetSize(viewportWidth, viewportHeight) + viewport.SetSize(viewportWidth, viewportHeight-2) viewport.SetContent(reportContent) // Style the viewport with border @@ -71,14 +71,14 @@ func (m Model) renderReport() string { Border(lipgloss.RoundedBorder()). BorderForeground(t.Border()). BorderBackground(t.BackgroundPanel()). - Background(t.BackgroundPanel()). - Width(viewportWidth + 2). // +2 for border - Height(viewportHeight + 2) // +2 for border + Background(t.BackgroundPanel()) // Apply viewport styling viewport.Style = lipgloss.NewStyle(). Foreground(t.Text()). Background(t.BackgroundPanel()). + Width(viewportWidth). + Height(viewportHeight-8). Padding(1, 2) // Help text style @@ -102,13 +102,13 @@ func (m Model) renderReport() string { // Center the viewport in the available space contentArea := lipgloss.NewStyle(). Width(m.width). - Height(viewportHeight + 2). + Height(viewportHeight). 
Background(t.Background()) centeredViewport := contentArea.Render( lipgloss.Place( m.width, - viewportHeight+2, + viewportHeight, lipgloss.Center, lipgloss.Top, viewportContent, From c30193d64ca1b4f6617377a1d2b54dd9c2ec0862 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Mon, 8 Sep 2025 15:04:07 +0300 Subject: [PATCH 08/22] Hotfix - change the token input count --- packages/tui/internal/tui/report_view.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/tui/internal/tui/report_view.go b/packages/tui/internal/tui/report_view.go index 76ba8c5d..84bf31ec 100644 --- a/packages/tui/internal/tui/report_view.go +++ b/packages/tui/internal/tui/report_view.go @@ -59,7 +59,7 @@ func (m Model) renderReport() string { // Calculate viewport dimensions viewportWidth := m.width - 8 // Leave margins - viewportHeight := m.height - 4 // title(3) + help(1) + margins(4) + viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) // Create a temporary copy of the viewport to avoid modifying the original viewport := m.reportViewport @@ -68,6 +68,7 @@ func (m Model) renderReport() string { // Style the viewport with border viewportStyle := lipgloss.NewStyle(). + Height(viewportHeight - 8). Border(lipgloss.RoundedBorder()). BorderForeground(t.Border()). BorderBackground(t.BackgroundPanel()). @@ -102,13 +103,13 @@ func (m Model) renderReport() string { // Center the viewport in the available space contentArea := lipgloss.NewStyle(). Width(m.width). - Height(viewportHeight). + Height(viewportHeight - 8). Background(t.Background()) centeredViewport := contentArea.Render( lipgloss.Place( m.width, - viewportHeight, + viewportHeight-8, lipgloss.Center, lipgloss.Top, viewportContent, From 45c037f52a98392748489c7e7fd586f4fcf6a806 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 13:39:52 +0300 Subject: [PATCH 09/22] report is working when precondifuring the api key --- packages/tui/internal/tui/app.go | 2 +- packages/tui/internal/tui/eval_ui.go | 16 ++- packages/tui/internal/tui/evaluation.go | 17 ++-- rogue/evaluator_agent/evaluator_agent.py | 4 + rogue/run_cli.py | 10 +- rogue/server/api/__init__.py | 7 +- rogue/server/api/evaluation.py | 2 + rogue/server/api/llm.py | 106 +++++++++++++++++++- rogue/server/models/api_format.py | 11 +- rogue/server/services/__init__.py | 1 + rogue/server/services/api_format_service.py | 4 +- rogue/server/services/llm_service.py | 2 +- rogue/server/services/qualifire_service.py | 40 ++++++++ rogue/ui/components/scenario_runner.py | 3 + sdks/python/rogue_sdk/client.py | 8 ++ sdks/python/rogue_sdk/sdk.py | 8 ++ sdks/python/rogue_sdk/types.py | 26 +++++ 17 files changed, 239 insertions(+), 28 deletions(-) create mode 100644 rogue/server/services/qualifire_service.py diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 1a59e45b..3e350e60 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -87,7 +87,7 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) overallSummary := structuredSummary.Summary.OverallSummary keyFindings := structuredSummary.Summary.KeyFindings diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go index 2c8eba6d..e5e738ad 100644 
--- a/packages/tui/internal/tui/eval_ui.go +++ b/packages/tui/internal/tui/eval_ui.go @@ -14,7 +14,7 @@ type EvaluationViewState struct { JudgeModel string ParallelRuns int DeepTest bool - Scenarios []string + Scenarios []EvalScenario // Runtime Running bool @@ -35,7 +35,7 @@ type EvaluationViewState struct { } // loadScenariosFromWorkdir reads .rogue/scenarios.json upward from CWD -func loadScenariosFromWorkdir() []string { +func loadScenariosFromWorkdir() []EvalScenario { wd, _ := os.Getwd() dir := wd for { @@ -43,14 +43,20 @@ func loadScenariosFromWorkdir() []string { if b, err := os.ReadFile(p); err == nil { var v struct { Scenarios []struct { - Scenario string `json:"scenario"` + Scenario string `json:"scenario"` + ScenarioType string `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome"` } `json:"scenarios"` } if json.Unmarshal(b, &v) == nil { - out := make([]string, 0, len(v.Scenarios)) + out := make([]EvalScenario, 0, len(v.Scenarios)) for _, s := range v.Scenarios { if s.Scenario != "" { - out = append(out, s.Scenario) + out = append(out, EvalScenario{ + Scenario: s.Scenario, + ScenarioType: ScenarioType(s.ScenarioType), + ExpectedOutcome: s.ExpectedOutcome, + }) } } return out diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 69bdbc2d..8575556b 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -42,8 +42,9 @@ type AgentConfig struct { } type EvalScenario struct { - Scenario string `json:"scenario"` - ScenarioType ScenarioType `json:"scenario_type"` + Scenario string `json:"scenario"` + ScenarioType ScenarioType `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome,omitempty"` } type EvaluationRequest struct { @@ -415,7 +416,7 @@ func (sdk *RogueSDK) CancelEvaluation(ctx context.Context, jobID string) error { } // StartEvaluation is the main entry point used by the TUI -func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []string, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { +func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []EvalScenario, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { sdk := NewRogueSDK(serverURL) // Validate URLs @@ -443,8 +444,9 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, // Convert scenarios for _, s := range scenarios { request.Scenarios = append(request.Scenarios, EvalScenario{ - Scenario: s, - ScenarioType: ScenarioTypePolicy, + Scenario: s.Scenario, + ScenarioType: s.ScenarioType, + ExpectedOutcome: s.ExpectedOutcome, }) } @@ -452,7 +454,7 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (*SummaryResp, error) { +func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string, qualifireAPIKey *string) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { @@ -470,6 +472,9 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s "results": map[string]interface{}{ "results": job.Results, }, + "job_id": jobID, + "qualifire_api_key": *qualifireAPIKey, + "qualifire_url": 
"http://localhost:3000", } body, err := json.Marshal(summaryReq) diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py index 79e4f608..d6d25ed8 100644 --- a/rogue/evaluator_agent/evaluator_agent.py +++ b/rogue/evaluator_agent/evaluator_agent.py @@ -122,6 +122,7 @@ - `scenario`: The entire scenario json object being tested. The json-object contains: - "scenario": The scenario text. - "scenario_type": The scenario type. + - "expected_outcome": The expected outcome of the scenario. - `context_id`: The conversation's context ID - `evaluation_passed`: Boolean indicating whether the agent complied with the policy. You should determine this based on the conversation. - `reason`: A brief explanation of your decision @@ -363,6 +364,7 @@ def _log_evaluation( context_id: str, evaluation_passed: bool, reason: str, + scenario_type: Optional[str], ) -> None: """ Logs the evaluation of the given scenario and test case. @@ -370,6 +372,7 @@ def _log_evaluation( This is the scenario dictionary containing both the scenario text and type: - scenario: The scenario text. - scenario_type: The scenario type. + - expected_outcome: The expected outcome of the scenario. :param context_id: The conversation's context_id. This allows us to distinguish which conversation is being evaluated. :param evaluation_passed: A boolean value with the evaluation result. This is @@ -391,6 +394,7 @@ def _log_evaluation( ), "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, + "scenario_type": scenario_type, }, ) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index f0894eeb..82eb60a6 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -188,6 +188,9 @@ async def create_report( results: EvaluationResults, output_report_file: Path, judge_llm_api_key_secret: SecretStr | None = None, + qualifire_api_key_secret: SecretStr | None = None, + deep_test_mode: bool = False, + judge_model: str | None = None, ) -> str: judge_llm_api_key = ( judge_llm_api_key_secret.get_secret_value() @@ -203,10 +206,13 @@ async def create_report( sdk = RogueSDK(sdk_config) try: - summary, structured_summary = await sdk.generate_summary( + summary, _ = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, + qualifire_api_key=qualifire_api_key_secret, + deep_test=deep_test_mode, + judge_model=judge_model, ) finally: await sdk.close() @@ -352,6 +358,8 @@ async def run_cli(args: Namespace) -> int: results=results, output_report_file=cli_input.output_report_file, judge_llm_api_key_secret=cli_input.judge_llm_api_key, + deep_test_mode=cli_input.deep_test_mode, + judge_model=cli_input.judge_llm, ) logger.info("Report saved", extra={"report_file": cli_input.output_report_file}) diff --git a/rogue/server/api/__init__.py b/rogue/server/api/__init__.py index e1dec064..0b1b8c2e 100644 --- a/rogue/server/api/__init__.py +++ b/rogue/server/api/__init__.py @@ -2,6 +2,11 @@ API endpoints for the Rogue Agent Evaluator Server. """ -from . import evaluation, health, interview, llm +from . 
import ( + evaluation, + health, + interview, + llm, +) __all__ = ["evaluation", "health", "interview", "llm"] diff --git a/rogue/server/api/evaluation.py b/rogue/server/api/evaluation.py index 00e29c2f..636dbf0c 100644 --- a/rogue/server/api/evaluation.py +++ b/rogue/server/api/evaluation.py @@ -59,6 +59,8 @@ async def create_evaluation( status=EvaluationStatus.PENDING, created_at=datetime.now(timezone.utc), request=request, + deep_test=request.agent_config.deep_test_mode, + judge_model=request.agent_config.judge_llm, ) await evaluation_service.add_job(job) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 73a312f4..acc0166b 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -4,17 +4,24 @@ This module provides REST API endpoints for LLM operations. """ -from fastapi import APIRouter, HTTPException +from datetime import datetime, timezone +from fastapi import APIRouter, Depends, HTTPException from rogue_sdk.types import ( ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, + ReportSummaryResponse, + ReportSummaryRequest, ) +from rogue.server.api.evaluation import get_evaluation_service +from rogue.server.services.evaluation_service import EvaluationService + from ..models.api_format import ServerSummaryGenerationResponse from ...common.logging import get_logger from ..services.llm_service import LLMService +from ..services.qualifire_service import QualifireService router = APIRouter(prefix="/llm", tags=["llm"]) logger = get_logger(__name__) @@ -58,9 +65,13 @@ async def generate_scenarios(request: ScenarioGenerationRequest): ) -@router.post("/summary", response_model=ServerSummaryGenerationResponse) +@router.post( + "/summary", + response_model=ServerSummaryGenerationResponse, +) async def generate_summary( request: SummaryGenerationRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), ) -> ServerSummaryGenerationResponse: """ Generate evaluation summary from results. 
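
Before the next hunk lands, it helps to see the request shape this endpoint accepts once the Qualifire fields are threaded through. A rough sketch, assuming a local server on port 8000 and placeholder values; the payload mirrors SummaryGenerationRequest from sdks/python/rogue_sdk/types.py:

    # Sketch: exercising the extended summary endpoint by hand.
    import requests

    payload = {
        "results": {"results": []},          # serialized EvaluationResults
        "model": "openai/gpt-4.1",
        "job_id": "job-123",                 # lets the server pull deep_test/judge_model off the job
        "deep_test": False,
        "judge_model": "openai/gpt-4o-mini",
        "qualifire_api_key": "qf-...",       # key + job_id together trigger Qualifire reporting
    }

    resp = requests.post("http://localhost:8000/api/v1/llm/summary", json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json()["summary"]["overall_summary"])  # StructuredSummary in the response body
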
@@ -84,6 +95,56 @@ async def generate_summary( logger.info("Successfully generated evaluation summary") + logger.info( + "Qualifire API key", + extra={"qualifire_api_key": request.qualifire_api_key}, + ) + logger.info( + "Job ID", + extra={"job_id": request.job_id}, + ) + logger.info( + "Qualifire URL", + extra={"qualifire_url": request.qualifire_url}, + ) + + if request.qualifire_api_key and request.job_id: + + logger.info( + "Reporting summary to Qualifire", + extra={"job_id": request.job_id}, + ) + + job = await evaluation_service.get_job(request.job_id) + + if not job and not request.judge_model and not request.deep_test: + raise HTTPException( + status_code=400, + detail="Job not found and judge model and deep test are not provided", # noqa: E501 + ) + + logger.info( + "Summary", + extra={"summary": summary, "results": request.results}, + ) + + QualifireService.report_summary( + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=summary, + deep_test=job.deep_test if job else request.deep_test, + start_time=( + job.created_at + if job is not None + else datetime.now(timezone.utc) + ), + judge_model=job.judge_model if job else request.judge_model, + qualifire_url=request.qualifire_url, + qualifire_api_key=request.qualifire_api_key, + ), + evaluation_result=request.results, + ) + return ServerSummaryGenerationResponse( summary=summary, message="Successfully generated evaluation summary", @@ -95,3 +156,44 @@ async def generate_summary( status_code=500, detail=f"Failed to generate summary: {str(e)}", ) + + +@router.post("/report_summary", response_model=ReportSummaryResponse) +async def report_summary_handler( + request: ReportSummaryRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), +): + """ + Report summary to Qualifire. 
+ """ + try: + job = await evaluation_service.get_job(request.job_id) + + if not job: + raise HTTPException( + status_code=404, + detail="Evaluation job not found", + ) + + results = job.results + + if not results or len(results) == 0: + raise HTTPException( + status_code=404, + detail="Evaluation results not found or empty", + ) + + QualifireService.report_summary( + request, + evaluation_result=results[0], + ) + + return ReportSummaryResponse( + success=True, + ) + except Exception as e: + logger.exception("Failed to report summary") + raise HTTPException( + status_code=500, + detail=f"Failed to report summary: {str(e)}", + ) diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py index 1142b729..f3895b8e 100644 --- a/rogue/server/models/api_format.py +++ b/rogue/server/models/api_format.py @@ -8,15 +8,7 @@ from typing import List, Optional from pydantic import BaseModel - - -class StructuredSummary(BaseModel): - """Structured summary response from LLM.""" - - overall_summary: str - key_findings: List[str] - recommendations: List[str] - detailed_breakdown: List[dict] # Table rows for scenario breakdown +from rogue_sdk.types import StructuredSummary class ApiChatMessage(BaseModel): @@ -39,6 +31,7 @@ class ApiScenarioResult(BaseModel): """Result of evaluating a single scenario in new API format.""" description: Optional[str] = None + expectedOutcome: Optional[str] = None totalConversations: Optional[int] = None flaggedConversations: Optional[int] = None conversations: List[ApiConversationEvaluation] diff --git a/rogue/server/services/__init__.py b/rogue/server/services/__init__.py index 95047763..6b0b3aaf 100644 --- a/rogue/server/services/__init__.py +++ b/rogue/server/services/__init__.py @@ -5,4 +5,5 @@ interviewer_service, llm_service, scenario_evaluation_service, + qualifire_service, ) diff --git a/rogue/server/services/api_format_service.py b/rogue/server/services/api_format_service.py index 44ccc2cf..d63336e2 100644 --- a/rogue/server/services/api_format_service.py +++ b/rogue/server/services/api_format_service.py @@ -7,14 +7,13 @@ from datetime import datetime, timezone from typing import Optional -from rogue_sdk.types import EvaluationResults +from rogue_sdk.types import EvaluationResults, StructuredSummary from ..models.api_format import ( ApiChatMessage, ApiConversationEvaluation, ApiEvaluationResult, ApiScenarioResult, - StructuredSummary, ) @@ -80,6 +79,7 @@ def convert_to_api_format( api_scenarios.append( ApiScenarioResult( description=result.scenario.scenario, + expectedOutcome=result.scenario.expected_outcome, totalConversations=len(api_conversations), flaggedConversations=len( [c for c in api_conversations if not c.passed], diff --git a/rogue/server/services/llm_service.py b/rogue/server/services/llm_service.py index aa229c04..a775edf9 100644 --- a/rogue/server/services/llm_service.py +++ b/rogue/server/services/llm_service.py @@ -4,8 +4,8 @@ from litellm import completion from loguru import logger from rogue_sdk.types import EvaluationResults, Scenario, Scenarios, ScenarioType +from rogue_sdk.types import StructuredSummary -from ..models.api_format import StructuredSummary SCENARIO_GENERATION_SYSTEM_PROMPT = """ # Test Scenario Designer diff --git a/rogue/server/services/qualifire_service.py b/rogue/server/services/qualifire_service.py new file mode 100644 index 00000000..05aabf60 --- /dev/null +++ b/rogue/server/services/qualifire_service.py @@ -0,0 +1,40 @@ +import requests +from loguru import logger + +from .api_format_service import 
convert_with_structured_summary +from rogue_sdk.types import EvaluationResult, ReportSummaryRequest + + +class QualifireService: + @staticmethod + def report_summary( + request: ReportSummaryRequest, + evaluation_result: EvaluationResult, + ): + logger.info( + "Reporting summary to Qualifire", + ) + + api_evaluation_result = convert_with_structured_summary( + evaluation_results=evaluation_result, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=request.start_time, + judge_model=request.judge_model, + ) + + response = requests.post( + f"{request.qualifire_url}/api/rogue/v1/report", + headers={"X-qualifire-key": request.qualifire_api_key}, + json=api_evaluation_result.model_dump(mode="json"), + timeout=300, + ) + + if not response.ok: + logger.error( + "Failed to report summary to Qualifire", + extra={"response": response.json()}, + ) + raise Exception(f"Failed to report summary to Qualifire: {response.json()}") + + return response.json() diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index b1fa6d7e..e46c5540 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -493,6 +493,9 @@ def on_status_update(status_data): results=all_results, model=config.get("service_llm"), api_key=config.get("judge_llm_api_key"), + qualifire_api_key=config.get("qualifire_api_key"), + deep_test=config.get("deep_test_mode", False), + judge_model=config.get("judge_llm"), ) state["structured_summary"] = structured_summary diff --git a/sdks/python/rogue_sdk/client.py b/sdks/python/rogue_sdk/client.py index f88b7fd2..09546da8 100644 --- a/sdks/python/rogue_sdk/client.py +++ b/sdks/python/rogue_sdk/client.py @@ -149,11 +149,19 @@ async def generate_summary( results: EvaluationResults, model: str, api_key: Optional[str] = None, + qualifire_api_key: Optional[str] = None, + job_id: Optional[str] = None, + deep_test: bool = False, + judge_model: Optional[str] = None, ) -> SummaryGenerationResponse: """Generate summary via API.""" data = SummaryGenerationRequest( results=results, model=model, + qualifire_api_key=qualifire_api_key, + job_id=job_id, + deep_test=deep_test, + judge_model=judge_model, ) if api_key: data.api_key = api_key diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index 98bdc55a..d1ec55cf 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -289,12 +289,20 @@ async def generate_summary( results: EvaluationResults, model: str = "openai/gpt-4o-mini", api_key: Optional[str] = None, + qualifire_api_key: Optional[str] = None, + job_id: Optional[str] = None, + deep_test: bool = False, + judge_model: Optional[str] = None, ) -> Tuple[str, StructuredSummary]: """Generate evaluation summary from results.""" response_data = await self.http_client.generate_summary( results=results, model=model, api_key=api_key, + qualifire_api_key=qualifire_api_key, + job_id=job_id, + deep_test=deep_test, + judge_model=judge_model, ) # Convert structured summary back to string format for backward compatibility diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index b1b8706c..6842a61f 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -77,6 +77,7 @@ class AgentConfig(BaseModel): parallel_runs: int = 1 judge_llm_api_key: Optional[str] = None business_context: str = "" + qualifire_api_key: Optional[str] = None @model_validator(mode="after") def check_auth_credentials(self) -> "AgentConfig": @@ 
-385,6 +386,8 @@ class EvaluationJob(BaseModel): results: Optional[List[EvaluationResult]] = None error_message: Optional[str] = None progress: float = 0.0 + deep_test: bool = False + judge_model: Optional[str] = None class EvaluationResponse(BaseModel): @@ -431,6 +434,11 @@ class SummaryGenerationRequest(BaseModel): results: EvaluationResults model: str = "openai/gpt-4.1" api_key: Optional[str] = None + qualifire_api_key: Optional[str] = None + job_id: str = "" + deep_test: bool = False + judge_model: Optional[str] = None + qualifire_url: Optional[str] = "https://app.qualifire.ai" class StructuredSummary(BaseModel): @@ -488,3 +496,21 @@ def validate_base_url(cls, v: str | HttpUrl) -> HttpUrl: if isinstance(v, str): return HttpUrl(v) return v + + +class ReportSummaryRequest(BaseModel): + """Request to report a summary.""" + + job_id: str + structured_summary: Optional[StructuredSummary] = None + deep_test: bool = False + start_time: Optional[datetime] = None + judge_model: Optional[str] = None + qualifire_api_key: Optional[str] = None + qualifire_url: Optional[str] = "https://app.qualifire.ai" + + +class ReportSummaryResponse(BaseModel): + """Response to report a summary.""" + + success: bool From 7036bb0e8337386bfac13738379be7f006596f54 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 14:38:13 +0300 Subject: [PATCH 10/22] fixed paste --- .../internal/components/llm_config_dialog.go | 6 +++--- packages/tui/internal/tui/app.go | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/packages/tui/internal/components/llm_config_dialog.go b/packages/tui/internal/components/llm_config_dialog.go index b07471a3..54fcd104 100644 --- a/packages/tui/internal/components/llm_config_dialog.go +++ b/packages/tui/internal/components/llm_config_dialog.go @@ -549,7 +549,7 @@ func (d LLMConfigDialog) handleEnter() (LLMConfigDialog, tea.Cmd) { // handlePaste handles clipboard paste operation for API key input func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { // Get clipboard content based on the operating system - clipboardText, err := getClipboardContent() + clipboardText, err := GetClipboardContent() if err != nil { // If clipboard reading fails, just return without error return d, nil @@ -569,8 +569,8 @@ func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { return d, nil } -// getClipboardContent reads content from the system clipboard -func getClipboardContent() (string, error) { +// GetClipboardContent reads content from the system clipboard +func GetClipboardContent() (string, error) { var cmd *exec.Cmd switch runtime.GOOS { diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 3e350e60..f1f9a30c 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -312,6 +312,25 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { } return m, tea.Batch(cmds...) 
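
The branch added just below reuses the newly exported GetClipboardContent and flattens whatever was pasted before appending it to the open dialog's input. The cleaning rule is tiny; a sketch of the equivalent logic (Python used here purely for illustration; the TUI does this in Go via strings.ReplaceAll and strings.TrimSpace):

    # Sketch: newline-free, trimmed paste text, matching the Go handler below.
    def clean_clipboard(text: str) -> str:
        return text.replace("\n", "").strip()

    assert clean_clipboard("  qf-abc\ndef  \n") == "qf-abcdef"
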
} + + if m.dialog != nil { + clipboardText, err := components.GetClipboardContent() + if err != nil { + // If clipboard reading fails, just return without error + return m, nil + } + + // Clean the clipboard text (remove newlines and trim whitespace) + cleanText := strings.TrimSpace(strings.ReplaceAll(clipboardText, "\n", "")) + + if cleanText == "" { + return m, nil + } + + m.dialog.Input += cleanText + m.dialog.InputCursor = len(m.dialog.Input) + return m, nil + } case components.SpinnerTickMsg: // Update spinners m.healthSpinner, cmd = m.healthSpinner.Update(msg) From 64ff4641bcb892617241ece94aa4f6e7f58f421b Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 15:52:11 +0300 Subject: [PATCH 11/22] wip --- sdks/python/rogue_sdk/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index 6842a61f..d7a929d6 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -434,10 +434,10 @@ class SummaryGenerationRequest(BaseModel): results: EvaluationResults model: str = "openai/gpt-4.1" api_key: Optional[str] = None - qualifire_api_key: Optional[str] = None job_id: str = "" deep_test: bool = False judge_model: Optional[str] = None + qualifire_api_key: Optional[str] = None qualifire_url: Optional[str] = "https://app.qualifire.ai" From 2f23574d1f24ce2cfe558efcc82eafe41f200ac8 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 13:17:48 +0300 Subject: [PATCH 12/22] fixed report_summary --- rogue/evaluator_agent/evaluator_agent.py | 4 ++++ rogue/server/api/llm.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py index d6d25ed8..001ae760 100644 --- a/rogue/evaluator_agent/evaluator_agent.py +++ b/rogue/evaluator_agent/evaluator_agent.py @@ -395,6 +395,10 @@ def _log_evaluation( "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, "scenario_type": scenario_type, + "expected_outcome": scenario.get( + "expected_outcome", + "None", + ), }, ) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index acc0166b..367358f0 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from fastapi import APIRouter, Depends, HTTPException from rogue_sdk.types import ( + EvaluationResults, ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, @@ -185,7 +186,7 @@ async def report_summary_handler( QualifireService.report_summary( request, - evaluation_result=results[0], + evaluation_result=EvaluationResults(results=results), ) return ReportSummaryResponse( @@ -194,6 +195,6 @@ async def report_summary_handler( except Exception as e: logger.exception("Failed to report summary") raise HTTPException( - status_code=500, + status_code=e.status_code if hasattr(e, "status_code") else 500, detail=f"Failed to report summary: {str(e)}", ) From 5cb2a0cdf3bc3b51d9f8f2a6a77bb737ce47805f Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 14:04:27 +0300 Subject: [PATCH 13/22] tui report save --- packages/tui/internal/tui/app.go | 82 ++++++++++++++++++++++- packages/tui/internal/tui/eval_ui.go | 1 + packages/tui/internal/tui/evaluation.go | 88 +++++++++++++++++++++---- rogue/server/api/llm.py | 10 ++- sdks/python/rogue_sdk/types.py | 5 +- 5 files changed, 167 insertions(+), 19 deletions(-) diff --git a/packages/tui/internal/tui/app.go 
b/packages/tui/internal/tui/app.go index f1f9a30c..0fe05b7b 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -87,7 +87,25 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) + structuredSummary, err := sdk.GenerateSummary( + ctx, + m.evalState.JobID, + judgeModel, + apiKey, + &m.config.QualifireAPIKey, + m.evalState.DeepTest, + judgeModel, + m.config.ServerURL, + ) + + if err != nil { + return SummaryGeneratedMsg{ + Summary: "", + Err: err, + } + } + + m.evalState.StructuredSummary = structuredSummary.Summary overallSummary := structuredSummary.Summary.OverallSummary keyFindings := structuredSummary.Summary.KeyFindings @@ -531,6 +549,66 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // Handle dialog closure if m.dialog != nil { switch msg.Action { + case "save_qualifire_and_report": + // Handle Qualifire API key save and report persistence + if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { + // Save the API key to config (allow empty to clear the key) + m.config.QualifireAPIKey = msg.Input + // Only enable integration if there's an API key + if msg.Input != "" { + m.config.QualifireEnabled = true + if m.configState != nil { + m.configState.QualifireEnabled = true + m.configState.HasChanges = true + } + } + + // immediately report the summary + if m.evalState != nil && m.evalState.Completed { + sdk := NewRogueSDK(m.config.ServerURL) + err := sdk.ReportSummary( + context.Background(), + m.evalState.JobID, + m.evalState.StructuredSummary, + m.evalState.DeepTest, + m.evalState.JudgeModel, + m.config.QualifireAPIKey, + ) + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Report Summary Error", + fmt.Sprintf("Failed to report summary: %v", err), + ) + m.dialog = &errorDialog + } + + err = m.saveConfig() + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Configuration Error", + fmt.Sprintf("Failed to save Qualifire configuration: %v", err), + ) + m.dialog = &errorDialog + return m, nil + } else { + // Show appropriate success dialog + var message string + if msg.Input != "" { + message = "Qualifire API key has been successfully saved and integration is now enabled. Your evaluation report will now be automatically persisted." + } else { + message = "Qualifire API key has been cleared and integration is now disabled." 
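
Once the key is saved and a completed evaluation is in memory, this handler calls sdk.ReportSummary (call site above; the Go implementation is in this patch's evaluation.go hunk further down), which posts the structured summary to the Rogue server. A hedged sketch of the equivalent request with placeholder values, mirroring ReportSummaryRequest from sdks/python/rogue_sdk/types.py:

    # Sketch: the report the "save and report" path sends to the Rogue server.
    import requests

    report = {
        "job_id": "job-123",
        "structured_summary": {
            "overall_summary": "...",
            "key_findings": [],
            "recommendations": [],
            "detailed_breakdown": [],
        },
        "deep_test": False,
        "judge_model": "openai/gpt-4o-mini",
        "qualifire_api_key": "qf-...",
    }

    resp = requests.post("http://localhost:8000/api/v1/llm/report_summary", json=report, timeout=300)
    resp.raise_for_status()  # the server answers {"success": true} on success
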
+ } + successDialog := components.NewInfoDialog( + "Qualifire Configured", + message, + ) + m.dialog = &successDialog + return m, nil + } + } + } case "save_qualifire": // Handle Qualifire API key save if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { @@ -589,7 +667,7 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { ) // Customize the buttons for this specific use case dialog.Buttons = []components.DialogButton{ - {Label: "Save", Action: "save_qualifire", Style: components.PrimaryButton}, + {Label: "Save", Action: "save_qualifire_and_report", Style: components.PrimaryButton}, } // Position cursor at end of existing key if there is one dialog.InputCursor = len(m.config.QualifireAPIKey) diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go index e5e738ad..8d2e9d92 100644 --- a/packages/tui/internal/tui/eval_ui.go +++ b/packages/tui/internal/tui/eval_ui.go @@ -28,6 +28,7 @@ type EvaluationViewState struct { JobID string // For tracking the evaluation job Completed bool // Whether evaluation finished successfully SummaryGenerated bool // Whether summary generation was already attempted + StructuredSummary StructuredSummary // Editing state for New Evaluation currentField int // 0: AgentURL, 1: JudgeModel, 2: DeepTest, 3: StartButton diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 8575556b..920cc11d 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -93,18 +93,19 @@ type RogueSDK struct { ws *websocket.Conn } +type StructuredSummary struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + Recommendations []string `json:"recommendations"` + DetailedBreakdown []struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` + } `json:"detailed_breakdown"` +} type SummaryResp struct { - Summary struct { - OverallSummary string `json:"overall_summary"` - KeyFindings []string `json:"key_findings"` - Recommendations []string `json:"recommendations"` - DetailedBreakdown []struct { - Scenario string `json:"scenario"` - Status string `json:"status"` - Outcome string `json:"outcome"` - } `json:"detailed_breakdown"` - } `json:"summary"` - Message string `json:"message"` + Summary StructuredSummary `json:"summary"` + Message string `json:"message"` } // NewRogueSDK creates a new SDK instance @@ -416,7 +417,15 @@ func (sdk *RogueSDK) CancelEvaluation(ctx context.Context, jobID string) error { } // StartEvaluation is the main entry point used by the TUI -func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []EvalScenario, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { +func (m *Model) StartEvaluation( + ctx context.Context, + serverURL string, + agentURL string, + scenarios []EvalScenario, + judgeModel string, + parallelRuns int, + deepTest bool, +) (<-chan EvaluationEvent, func() error, error) { sdk := NewRogueSDK(serverURL) // Validate URLs @@ -454,7 +463,14 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string, qualifireAPIKey *string) (*SummaryResp, error) { +func (sdk *RogueSDK) GenerateSummary( + ctx context.Context, + jobID, model, apiKey string, + 
qualifireAPIKey *string, + deepTest bool, + judgeModel string, + qualifireURL string, +) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { @@ -474,7 +490,9 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s }, "job_id": jobID, "qualifire_api_key": *qualifireAPIKey, - "qualifire_url": "http://localhost:3000", + "qualifire_url": qualifireURL, + "deep_test": deepTest, + "judge_model": judgeModel, } body, err := json.Marshal(summaryReq) @@ -513,6 +531,48 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s return &summaryResp, nil } +// ReportSummary reports a summary to Qualifire +func (sdk *RogueSDK) ReportSummary( + ctx context.Context, + jobID string, + summary StructuredSummary, + deepTest bool, + judgeModel string, + qualifireAPIKey string, +) error { + reportReq := map[string]interface{}{ + "job_id": jobID, + "structured_summary": summary, + "deep_test": deepTest, + "judge_model": judgeModel, + "qualifire_api_key": qualifireAPIKey, + } + + body, err := json.Marshal(reportReq) + if err != nil { + return err + } + + req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/report_summary", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := sdk.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("report summary failed: %d %s", resp.StatusCode, string(body)) + } + + return nil +} + // CheckServerHealth calls GET /health and returns the status string func (m *Model) CheckServerHealth(ctx context.Context, serverURL string) (string, error) { sdk := NewRogueSDK(serverURL) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 367358f0..05b9459e 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -185,7 +185,15 @@ async def report_summary_handler( ) QualifireService.report_summary( - request, + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=job.created_at, + judge_model=job.judge_model, + qualifire_api_key=request.qualifire_api_key, + qualifire_url=request.qualifire_url, + ), evaluation_result=EvaluationResults(results=results), ) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index d7a929d6..dc91f321 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -504,10 +504,11 @@ class ReportSummaryRequest(BaseModel): job_id: str structured_summary: Optional[StructuredSummary] = None deep_test: bool = False - start_time: Optional[datetime] = None judge_model: Optional[str] = None + start_time: Optional[datetime] = None qualifire_api_key: Optional[str] = None - qualifire_url: Optional[str] = "https://app.qualifire.ai" + # qualifire_url: Optional[str] = "https://app.qualifire.ai" + qualifire_url: Optional[str] = "http://localhost:3000" class ReportSummaryResponse(BaseModel): From d9ce9e243cd16a9dec3f9d2b2d2a2ac5a8d3cfad Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 14:04:56 +0300 Subject: [PATCH 14/22] tui report save --- sdks/python/rogue_sdk/types.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index dc91f321..96fedddc 100644 --- 
a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -507,8 +507,7 @@ class ReportSummaryRequest(BaseModel): judge_model: Optional[str] = None start_time: Optional[datetime] = None qualifire_api_key: Optional[str] = None - # qualifire_url: Optional[str] = "https://app.qualifire.ai" - qualifire_url: Optional[str] = "http://localhost:3000" + qualifire_url: Optional[str] = "https://app.qualifire.ai" class ReportSummaryResponse(BaseModel): From c5134b3d292e5a4e77547b81241f26529400267b Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 15:19:10 +0300 Subject: [PATCH 15/22] tui report save --- packages/tui/internal/tui/app.go | 15 +++++++++++---- packages/tui/internal/tui/evaluation.go | 2 -- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 0fe05b7b..1c75b0e2 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -86,16 +86,18 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { // Create a context with longer timeout for summary generation ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - + parsedAPIKey := &m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = nil + } structuredSummary, err := sdk.GenerateSummary( ctx, m.evalState.JobID, judgeModel, apiKey, - &m.config.QualifireAPIKey, + parsedAPIKey, m.evalState.DeepTest, judgeModel, - m.config.ServerURL, ) if err != nil { @@ -565,6 +567,11 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // immediately report the summary if m.evalState != nil && m.evalState.Completed { + parsedAPIKey := m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = "" + } + sdk := NewRogueSDK(m.config.ServerURL) err := sdk.ReportSummary( context.Background(), @@ -572,7 +579,7 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.evalState.StructuredSummary, m.evalState.DeepTest, m.evalState.JudgeModel, - m.config.QualifireAPIKey, + parsedAPIKey, ) if err != nil { // Show error dialog diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 920cc11d..a3ab4410 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -469,7 +469,6 @@ func (sdk *RogueSDK) GenerateSummary( qualifireAPIKey *string, deepTest bool, judgeModel string, - qualifireURL string, ) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) @@ -490,7 +489,6 @@ func (sdk *RogueSDK) GenerateSummary( }, "job_id": jobID, "qualifire_api_key": *qualifireAPIKey, - "qualifire_url": qualifireURL, "deep_test": deepTest, "judge_model": judgeModel, } From 38ccedd41637e58119b8bcb2efc4340731d4e2b9 Mon Sep 17 00:00:00 2001 From: drorIvry Date: Thu, 11 Sep 2025 13:49:33 +0300 Subject: [PATCH 16/22] Update rogue/run_cli.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- rogue/run_cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index 82eb60a6..62492b5b 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -206,11 +206,17 @@ async def create_report( sdk = RogueSDK(sdk_config) try: + try: + qualifire_api_key = ( + qualifire_api_key_secret.get_secret_value() + if qualifire_api_key_secret + else None + ) summary, _ = await sdk.generate_summary( results=results, model=judge_llm, 
             api_key=judge_llm_api_key,
-            qualifire_api_key=qualifire_api_key_secret,
+            qualifire_api_key=qualifire_api_key,
             deep_test=deep_test_mode,
             judge_model=judge_model,
         )

From 564f4a913ea4887c651504a598e5d301c8e8f9c4 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 13:53:23 +0300
Subject: [PATCH 17/22] ci

---
 rogue/run_cli.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rogue/run_cli.py b/rogue/run_cli.py
index 82eb60a6..53414cf0 100644
--- a/rogue/run_cli.py
+++ b/rogue/run_cli.py
@@ -214,6 +214,9 @@ async def create_report(
             deep_test=deep_test_mode,
             judge_model=judge_model,
         )
+        except Exception as e:
+            logger.exception("Failed to generate summary")
+            raise e
     finally:
         await sdk.close()

From 4d18d18a414f2dbeedf9d73c063f67e8db95ffd8 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 13:54:36 +0300
Subject: [PATCH 18/22] ci

---
 rogue/run_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rogue/run_cli.py b/rogue/run_cli.py
index 9cf71e1d..614af6b4 100644
--- a/rogue/run_cli.py
+++ b/rogue/run_cli.py
@@ -205,7 +205,6 @@ async def create_report(
     )
     sdk = RogueSDK(sdk_config)
-    try:
         try:
             qualifire_api_key = (
                 qualifire_api_key_secret.get_secret_value()
                 if qualifire_api_key_secret
                 else None
             )

From de3af0b2f58538717ddd4993600096b834527db1 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 14:22:21 +0300
Subject: [PATCH 19/22] ci

---
 rogue/server/api/llm.py                    | 4 ++--
 rogue/server/services/qualifire_service.py | 6 +++---
 sdks/python/rogue_sdk/types.py             | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py
index 05b9459e..28b271a0 100644
--- a/rogue/server/api/llm.py
+++ b/rogue/server/api/llm.py
@@ -143,7 +143,7 @@ async def generate_summary(
             qualifire_url=request.qualifire_url,
             qualifire_api_key=request.qualifire_api_key,
         ),
-        evaluation_result=request.results,
+        evaluation_results=request.results,
     )

     return ServerSummaryGenerationResponse(
@@ -194,7 +194,7 @@ async def report_summary_handler(
             qualifire_api_key=request.qualifire_api_key,
             qualifire_url=request.qualifire_url,
         ),
-        evaluation_result=EvaluationResults(results=results),
+        evaluation_results=EvaluationResults(results=results),
     )

     return ReportSummaryResponse(
diff --git a/rogue/server/services/qualifire_service.py b/rogue/server/services/qualifire_service.py
index 05aabf60..bfdf1356 100644
--- a/rogue/server/services/qualifire_service.py
+++ b/rogue/server/services/qualifire_service.py
@@ -2,21 +2,21 @@
 from loguru import logger

 from .api_format_service import convert_with_structured_summary
-from rogue_sdk.types import EvaluationResult, ReportSummaryRequest
+from rogue_sdk.types import EvaluationResults, ReportSummaryRequest


 class QualifireService:
     @staticmethod
     def report_summary(
         request: ReportSummaryRequest,
-        evaluation_result: EvaluationResult,
+        evaluation_results: EvaluationResults,
     ):
         logger.info(
             "Reporting summary to Qualifire",
         )

         api_evaluation_result = convert_with_structured_summary(
-            evaluation_results=evaluation_result,
+            evaluation_results=evaluation_results,
             structured_summary=request.structured_summary,
             deep_test=request.deep_test,
             start_time=request.start_time,
diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py
index 96fedddc..70cd15e8 100644
--- a/sdks/python/rogue_sdk/types.py
+++ b/sdks/python/rogue_sdk/types.py
@@ -434,7 +434,7 @@ class SummaryGenerationRequest(BaseModel):
     results: EvaluationResults
     model: str = "openai/gpt-4.1"
     api_key: Optional[str] = None
-    job_id: str = ""
+    job_id: Optional[str] = None
     deep_test: bool = False
     judge_model: Optional[str] = None
     qualifire_api_key: Optional[str] = None

From 1e5c5a05b374785fa462c724a500ac846cb50000 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:45:36 +0300
Subject: [PATCH 20/22] ci

---
 packages/tui/internal/tui/eval_ui.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go
index 8d2e9d92..76a553be 100644
--- a/packages/tui/internal/tui/eval_ui.go
+++ b/packages/tui/internal/tui/eval_ui.go
@@ -24,10 +24,10 @@ type EvaluationViewState struct {
 	cancelFn func() error

 	// Report generation
-	Summary          string // Generated markdown summary
-	JobID            string // For tracking the evaluation job
-	Completed        bool   // Whether evaluation finished successfully
-	SummaryGenerated bool   // Whether summary generation was already attempted
+	Summary           string // Generated markdown summary
+	JobID             string // For tracking the evaluation job
+	Completed         bool   // Whether evaluation finished successfully
+	SummaryGenerated  bool   // Whether summary generation was already attempted
 	StructuredSummary StructuredSummary

 	// Editing state for New Evaluation

From 53755b7ce6870dee86e23e786c625853e7508498 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:50:38 +0300
Subject: [PATCH 21/22] ci

---
 packages/tui/go.mod | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/tui/go.mod b/packages/tui/go.mod
index 413f9fe3..a06d4950 100644
--- a/packages/tui/go.mod
+++ b/packages/tui/go.mod
@@ -14,7 +14,6 @@ require (
 )

 require (
-	github.com/charmbracelet/x/exp/golden v0.0.0-20250207160936-21c02780d27a // indirect
 	github.com/charmbracelet/x/input v0.3.7 // indirect
 	github.com/charmbracelet/x/windows v0.2.1 // indirect
 	github.com/dlclark/regexp2 v1.11.5 // indirect

From e679d82c3cedf063900fc9827d42cc920d1148b3 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:57:33 +0300
Subject: [PATCH 22/22] ci

---
 .github/workflows/rogue.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rogue.yml b/.github/workflows/rogue.yml
index d70c5806..dc20301e 100644
--- a/.github/workflows/rogue.yml
+++ b/.github/workflows/rogue.yml
@@ -56,5 +56,5 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         with:
          evaluated_agent_url: "http://localhost:10001"
-          judge_llm: "openai/gpt-4.1-mini"
+          judge_llm: "openai/gpt-4.1"
           workdir: "./examples/tshirt_store_agent/.rogue"
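
Reviewer note on PATCH 15: summaryGenerationCmd now passes parsedAPIKey == nil into GenerateSummary whenever QualifireEnabled is false, but GenerateSummary still builds the request body with "qualifire_api_key": *qualifireAPIKey, so a disabled-Qualifire config would hit a nil-pointer dereference. A minimal guard is sketched below; it is an illustration only, not part of the series, and the helper name is hypothetical. It assumes (unverified) that the server treats an empty key as "Qualifire disabled":

	// qualifireKeyOrEmpty guards the optional Qualifire key before it is
	// dereferenced into GenerateSummary's request body; nil maps to "".
	func qualifireKeyOrEmpty(key *string) string {
		if key == nil {
			return ""
		}
		return *key
	}

The request map in evaluation.go would then read "qualifire_api_key": qualifireKeyOrEmpty(qualifireAPIKey) instead of dereferencing unconditionally.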
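
For the same gating, the post-completion reporting path that PATCH 15 lands in Update can be read as one small helper. The sketch below only restates the call pattern the diffs above establish; the Model, evalState, and config field names are taken from app.go and eval_ui.go, while the wrapper name is hypothetical. As a style point, it also uses the idiomatic !m.config.QualifireEnabled in place of the patch's m.config.QualifireEnabled == false:

	// reportSummaryIfCompleted reports the structured summary once the
	// evaluation has finished, blanking the key when Qualifire is disabled.
	func reportSummaryIfCompleted(ctx context.Context, m *Model) error {
		if m.evalState == nil || !m.evalState.Completed {
			return nil // nothing to report yet
		}
		apiKey := m.config.QualifireAPIKey
		if !m.config.QualifireEnabled {
			apiKey = ""
		}
		sdk := NewRogueSDK(m.config.ServerURL)
		return sdk.ReportSummary(
			ctx,
			m.evalState.JobID,
			m.evalState.StructuredSummary,
			m.evalState.DeepTest,
			m.evalState.JudgeModel,
			apiKey,
		)
	}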