From 3574a98b18b5a735ba263999d076e9b60c3b2278 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 15:38:07 +0300 Subject: [PATCH 01/22] Revert "Revert "wip on the new response"" This reverts commit bb3fb1c47a934e46565305d65d34040394ff54c5. --- rogue/tests/models/test_evaluation_result.py | 51 +++++++-- rogue/ui/components/report_generator.py | 111 ++++++++++++++++++- sdks/python/rogue_sdk/types.py | 101 ++++++++++++++++- 3 files changed, 243 insertions(+), 20 deletions(-) diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index b1458423..f7e67e02 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -1,4 +1,5 @@ import pytest +from datetime import datetime from rogue_sdk.types import ( ChatHistory, ChatMessage, @@ -7,6 +8,10 @@ EvaluationResults, Scenario, ) +from rogue.ui.components.report_generator import ( + convert_to_api_format, + ApiEvaluationResult, +) class TestEvaluationResults: @@ -54,26 +59,26 @@ def get_evaluation_result( EvaluationResults(), get_evaluation_result(scenario_1, conversation_1_passed), EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), ), # no overlap from non-empty results ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_2, conversation_1_failed), EvaluationResults( results=[ get_evaluation_result(scenario_1, conversation_1_passed), get_evaluation_result(scenario_2, conversation_1_failed), - ] + ], ), ), # scenario overlap with passed unchanged True -> True ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_passed), EvaluationResults( @@ -86,13 +91,13 @@ def get_evaluation_result( ], passed=True, ), - ] + ], ), ), # scenario overlap with passed changed True -> False ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -105,13 +110,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # scenario overlap with passed unchanged False -> False (#1) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -124,13 +129,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # scenario overlap with passed unchanged False -> False (#2) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result( scenario_1, @@ -146,7 +151,7 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), ], @@ -159,3 +164,27 @@ def test_add_result( ): existing_results.add_result(new_result) assert existing_results == expected_results + + def test_convert_to_api_format(self): + """Test conversion to new API format.""" + results = EvaluationResults() + result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) + 
results.add_result(result) + + api_format = convert_to_api_format(results) + + assert isinstance(api_format, ApiEvaluationResult) + assert len(api_format.scenarios) == 1 + assert api_format.scenarios[0].description == "Scenario 1" + assert api_format.scenarios[0].totalConversations == 1 + assert api_format.scenarios[0].flaggedConversations == 0 + assert len(api_format.scenarios[0].conversations) == 1 + assert api_format.scenarios[0].conversations[0].passed is True + assert api_format.scenarios[0].conversations[0].reason == "reason" + assert len(api_format.scenarios[0].conversations[0].messages) == 1 + + # Test message conversion + message = api_format.scenarios[0].conversations[0].messages[0] + assert message.role == "user" + assert message.content == "message 1" + assert isinstance(message.timestamp, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 538491b1..dd1236b7 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -1,9 +1,97 @@ from pathlib import Path from typing import Tuple +from datetime import datetime, timezone import gradio as gr from loguru import logger from rogue_sdk.types import EvaluationResults +from pydantic import BaseModel +from typing import List, Optional + + +# New API Format Types for report display +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + + +def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format.""" + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + return ApiEvaluationResult(scenarios=api_scenarios) def _load_report_data_from_files( @@ -60,13 +148,24 @@ def on_report_tab_select(state): ) results = 
EvaluationResults() + # Convert to new API format for display + try: + api_format_results = convert_to_api_format(results) + results_json = api_format_results.model_dump_json( + indent=2, + exclude_none=True, + ) + except Exception as e: + logger.warning( + f"Failed to convert results to API format: {e}", + extra={ + "results": results, + }, + ) + results_json = str(results) + return { - evaluation_results_display: gr.update( - value=results.model_dump_json( - indent=2, - exclude_none=True, - ), - ), + evaluation_results_display: gr.update(value=results_json), summary_display: gr.update(value=summary), } diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index b5359716..c2a741fa 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -85,7 +85,7 @@ def check_auth_credentials(self) -> "AgentConfig": if auth_type and auth_type != AuthType.NO_AUTH and not auth_credentials: raise ValueError( - "Authentication Credentials cannot be empty for the selected auth type." + "Authentication Credentials cannot be empty for the selected auth type.", # noqa: E501 ) return self @@ -110,7 +110,7 @@ def validate_dataset_for_type(self) -> "Scenario": if dataset_required and self.dataset is None: raise ValueError( f"`dataset` must be provided when scenario_type is " - f"'{self.scenario_type.value}'" + f"'{self.scenario_type.value}'", ) elif not dataset_required and self.dataset is not None: logger.info( @@ -143,7 +143,7 @@ def get_scenarios_by_type(self, scenario_type: ScenarioType) -> "Scenarios": scenario for scenario in self.scenarios if scenario.scenario_type == scenario_type - ] + ], ) def get_policy_scenarios(self) -> "Scenarios": @@ -207,6 +207,101 @@ def combine(self, other: "EvaluationResults"): self.add_result(result) +# New API Format Types + + +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + + +# Conversion functions for new API format +def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. 
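+
+    A minimal usage sketch (assumes an already-populated results object;
+    the result value is illustrative):
+
+        results = EvaluationResults()
+        results.add_result(some_result)  # some_result: a hypothetical EvaluationResult
+        api_result = convert_to_api_format(results)
+        api_result.model_dump_json(indent=2, exclude_none=True)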
+ + Args: + evaluation_results: Legacy evaluation results to convert + + Returns: + ApiEvaluationResult: New format evaluation result + """ + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + return ApiEvaluationResult(scenarios=api_scenarios) + + # Interview Types From ae68ff99b249086e69ac607da5a940135dc37929 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 15:47:13 +0300 Subject: [PATCH 02/22] wip --- rogue/tests/models/test_evaluation_result.py | 17 +++++- rogue/ui/components/report_generator.py | 59 ++++++++++++++++++-- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index f7e67e02..1d5c4dbd 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -171,7 +171,14 @@ def test_convert_to_api_format(self): result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) results.add_result(result) - api_format = convert_to_api_format(results) + api_format = convert_to_api_format( + evaluation_results=results, + summary="Test summary", + key_findings="Key finding 1", + recommendation="Test recommendation", + deep_test=True, + judge_model="openai/gpt-4o-mini", + ) assert isinstance(api_format, ApiEvaluationResult) assert len(api_format.scenarios) == 1 @@ -188,3 +195,11 @@ def test_convert_to_api_format(self): assert message.role == "user" assert message.content == "message 1" assert isinstance(message.timestamp, datetime) + + # Test new fields + assert api_format.summary == "Test summary" + assert api_format.keyFindings == "Key finding 1" + assert api_format.recommendation == "Test recommendation" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" + assert isinstance(api_format.startTime, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index dd1236b7..4e8624fc 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -39,10 +39,40 @@ class ApiEvaluationResult(BaseModel): """New API format for evaluation results.""" scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime + judgeModel: Optional[str] = None + + +def convert_to_api_format( + evaluation_results: 
EvaluationResults, + summary: Optional[str] = None, + key_findings: Optional[str] = None, + recommendation: Optional[str] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. + + Args: + evaluation_results: Legacy evaluation results to convert + summary: Generated summary of the evaluation + key_findings: Key findings from the evaluation + recommendation: Recommendations based on the evaluation + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started (defaults to current time) + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format evaluation result with additional metadata + """ + if start_time is None: + start_time = datetime.now(timezone.utc) - -def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult: - """Convert legacy EvaluationResults to new API format.""" api_scenarios = [] for result in evaluation_results.results: @@ -91,7 +121,15 @@ def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluatio ), ) - return ApiEvaluationResult(scenarios=api_scenarios) + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary, + keyFindings=key_findings, + recommendation=recommendation, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + ) def _load_report_data_from_files( @@ -150,7 +188,18 @@ def on_report_tab_select(state): # Convert to new API format for display try: - api_format_results = convert_to_api_format(results) + # Extract configuration and additional metadata from state + config = state.get("config", {}) + + api_format_results = convert_to_api_format( + evaluation_results=results, + summary=summary if summary != "No summary available." else None, + key_findings=state.get("key_findings"), + recommendation=state.get("recommendation"), + deep_test=config.get("deep_test_mode", False), + start_time=state.get("start_time"), + judge_model=config.get("judge_llm"), + ) results_json = api_format_results.model_dump_json( indent=2, exclude_none=True, From 354cbd0dcfed74478e002080c449bc31ec8f0685 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Thu, 4 Sep 2025 16:05:55 +0300 Subject: [PATCH 03/22] --wip-- [skip ci] --- rogue/server/models/api_format.py | 47 +++++++++ rogue/tests/models/test_evaluation_result.py | 12 +-- rogue/ui/components/report_generator.py | 100 ++++++++++++++++++- 3 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 rogue/server/models/api_format.py diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py new file mode 100644 index 00000000..a374e984 --- /dev/null +++ b/rogue/server/models/api_format.py @@ -0,0 +1,47 @@ +"""API format models for evaluation results. + +These models define the enhanced API format for evaluation results +that includes summary, key findings, recommendations, and metadata. 
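+
+A minimal construction sketch (field values are illustrative only):
+
+    ApiEvaluationResult(
+        startTime=datetime(2025, 9, 4, 12, 0),
+        scenarios=[
+            ApiScenarioResult(
+                description="Refund policy probing",
+                totalConversations=1,
+                flaggedConversations=0,
+                conversations=[],
+            )
+        ],
+    )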
+""" + +from datetime import datetime +from typing import List, Optional + +from pydantic import BaseModel + + +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime + judgeModel: Optional[str] = None diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index 1d5c4dbd..4deb72a7 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -173,9 +173,9 @@ def test_convert_to_api_format(self): api_format = convert_to_api_format( evaluation_results=results, - summary="Test summary", - key_findings="Key finding 1", - recommendation="Test recommendation", + summary="Test summary for overall evaluation", + key_findings="• Key finding 1\n• Key finding 2", + recommendation="• Recommendation 1\n• Recommendation 2", deep_test=True, judge_model="openai/gpt-4o-mini", ) @@ -197,9 +197,9 @@ def test_convert_to_api_format(self): assert isinstance(message.timestamp, datetime) # Test new fields - assert api_format.summary == "Test summary" - assert api_format.keyFindings == "Key finding 1" - assert api_format.recommendation == "Test recommendation" + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" assert api_format.deepTest is True assert api_format.judgeModel == "openai/gpt-4o-mini" assert isinstance(api_format.startTime, datetime) diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 4e8624fc..07e3c9cf 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -7,6 +7,90 @@ from rogue_sdk.types import EvaluationResults from pydantic import BaseModel from typing import List, Optional +import re + + +def parse_summary_sections(full_summary: str) -> tuple[str, str, str]: + """Parse a comprehensive summary into separate sections. 
+ + Args: + full_summary: The comprehensive summary text + + Returns: + Tuple of (summary, key_findings, recommendations) + """ + if not full_summary: + return None, None, None + + # Extract the main summary section (everything before Key Findings) + summary_match = re.search( + r"(.*?)(?=---\s*##?\s+Key Findings|##?\s+Key Findings)", + full_summary, + re.DOTALL | re.IGNORECASE, + ) + summary_section = "" + if summary_match: + summary_section = summary_match.group(1).strip() + # Clean up extra dashes and formatting + summary_section = re.sub(r"---+\s*$", "", summary_section).strip() + + # Extract Key Findings section + key_findings_match = re.search( + r"##?\s+Key Findings\s*[-]*\s*(.*?)(?=---\s*##?\s+Recommendations|##?\s+Recommendations|##?\s+Detailed Breakdown|$)", # noqa: E501 + full_summary, + re.DOTALL | re.IGNORECASE, + ) + key_findings_section = "" + if key_findings_match: + key_findings_section = key_findings_match.group(1).strip() + # Clean up bullet points and formatting + key_findings_section = re.sub( + r"^-\s*", + "", + key_findings_section, + flags=re.MULTILINE, + ) + key_findings_section = re.sub(r"---+\s*$", "", key_findings_section).strip() + # Fix bullet point formatting + key_findings_section = re.sub(r"\s*-\s*\*\*", "\n• **", key_findings_section) + if not key_findings_section.startswith( + "•", + ) and not key_findings_section.startswith("-"): + key_findings_section = "• " + key_findings_section + + # Extract Recommendations section + recommendations_match = re.search( + r"##?\s+Recommendations\s*[-]*\s*(.*?)(?=---\s*##?\s+Detailed Breakdown|##?\s+Detailed Breakdown|$)", # noqa: E501 + full_summary, + re.DOTALL | re.IGNORECASE, + ) + recommendations_section = "" + if recommendations_match: + recommendations_section = recommendations_match.group(1).strip() + # Clean up formatting + recommendations_section = re.sub( + r"---+\s*$", + "", + recommendations_section, + ).strip() + # Convert all numbered items to bullet points + recommendations_section = re.sub( + r"^\d+\.\s*", + "• ", + recommendations_section, + flags=re.MULTILINE, + ) + recommendations_section = re.sub( + r"\s+\d+\.\s*", + "\n• ", + recommendations_section, + ) + + return ( + summary_section if summary_section else None, + key_findings_section if key_findings_section else None, + recommendations_section if recommendations_section else None, + ) # New API Format Types for report display @@ -191,11 +275,21 @@ def on_report_tab_select(state): # Extract configuration and additional metadata from state config = state.get("config", {}) + # Parse the summary to extract separate sections + if summary and summary != "No summary available.": + parsed_summary, parsed_key_findings, parsed_recommendations = ( + parse_summary_sections(summary) + ) + else: + parsed_summary = None + parsed_key_findings = None + parsed_recommendations = None + api_format_results = convert_to_api_format( evaluation_results=results, - summary=summary if summary != "No summary available." 
else None, - key_findings=state.get("key_findings"), - recommendation=state.get("recommendation"), + summary=parsed_summary, + key_findings=parsed_key_findings or state.get("key_findings"), + recommendation=parsed_recommendations or state.get("recommendation"), deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), From dc2f6e986a4fb485adfc3e4ecc0eed2de5113dbd Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 12:25:57 +0300 Subject: [PATCH 04/22] wip --- examples/js/cli/package.json | 2 +- examples/js/langgraph-js-example/package.json | 2 +- examples/js/vercel-ai-example/package.json | 2 +- .../tshirt_store_agent_executor.py | 17 +- .../tui/internal/theme/themes/vesper.json | 2 +- rogue/common/generic_agent_executor.py | 17 +- rogue/common/remote_agent_connection.py | 4 +- rogue/server/api/llm.py | 11 +- rogue/server/models/api_format.py | 16 ++ rogue/server/services/__init__.py | 1 + rogue/server/services/interviewer_service.py | 6 +- rogue/server/services/llm_service.py | 111 +++++++-- .../services/scenario_evaluation_service.py | 8 +- rogue/tests/models/test_cli_input.py | 3 +- rogue/tests/models/test_evaluation_result.py | 27 +- rogue/ui/components/config_screen.py | 14 +- rogue/ui/components/report_generator.py | 231 +----------------- rogue/ui/components/scenario_runner.py | 1 + sdks/python/rogue_sdk/client.py | 25 +- sdks/python/rogue_sdk/sdk.py | 35 ++- sdks/python/rogue_sdk/tests/test_types.py | 8 +- sdks/python/rogue_sdk/types.py | 11 +- sdks/python/rogue_sdk/websocket.py | 4 +- 23 files changed, 253 insertions(+), 305 deletions(-) diff --git a/examples/js/cli/package.json b/examples/js/cli/package.json index 0afd3083..758f40a0 100644 --- a/examples/js/cli/package.json +++ b/examples/js/cli/package.json @@ -20,4 +20,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/langgraph-js-example/package.json b/examples/js/langgraph-js-example/package.json index 46860e2a..86b52c0b 100644 --- a/examples/js/langgraph-js-example/package.json +++ b/examples/js/langgraph-js-example/package.json @@ -28,4 +28,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/vercel-ai-example/package.json b/examples/js/vercel-ai-example/package.json index a5c1cee5..4c39d140 100644 --- a/examples/js/vercel-ai-example/package.json +++ b/examples/js/vercel-ai-example/package.json @@ -25,4 +25,4 @@ "uuid": "^11.1.0", "zod": "^3.24.1" } -} \ No newline at end of file +} diff --git a/examples/tshirt_store_agent/tshirt_store_agent_executor.py b/examples/tshirt_store_agent/tshirt_store_agent_executor.py index 68286326..32cb739a 100644 --- a/examples/tshirt_store_agent/tshirt_store_agent_executor.py +++ b/examples/tshirt_store_agent/tshirt_store_agent_executor.py @@ -129,7 +129,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -151,15 +151,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( 
inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -185,8 +186,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -196,7 +197,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/packages/tui/internal/theme/themes/vesper.json b/packages/tui/internal/theme/themes/vesper.json index b8406f93..08eade58 100644 --- a/packages/tui/internal/theme/themes/vesper.json +++ b/packages/tui/internal/theme/themes/vesper.json @@ -216,4 +216,4 @@ } } } - \ No newline at end of file + diff --git a/rogue/common/generic_agent_executor.py b/rogue/common/generic_agent_executor.py index 0e1de2f6..059dc77a 100644 --- a/rogue/common/generic_agent_executor.py +++ b/rogue/common/generic_agent_executor.py @@ -128,7 +128,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -150,15 +150,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -184,8 +185,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -195,7 +196,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/rogue/common/remote_agent_connection.py b/rogue/common/remote_agent_connection.py index e08caf83..4fdf080b 100644 --- a/rogue/common/remote_agent_connection.py +++ b/rogue/common/remote_agent_connection.py @@ -81,7 +81,7 @@ async def send_message( SendStreamingMessageRequest( id=uuid4().hex, params=request, - ) + ), ): logger.debug( "received stream response from remote agent", @@ -110,7 +110,7 @@ async def send_message( SendMessageRequest( id=uuid4().hex, params=request, - ) + ), ) logger.debug( diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 4fa434f8..73a312f4 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -9,9 +9,10 @@ ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, - SummaryGenerationResponse, ) +from ..models.api_format import ServerSummaryGenerationResponse + from ...common.logging import get_logger from ..services.llm_service import LLMService @@ -57,8 +58,10 @@ async def generate_scenarios(request: 
ScenarioGenerationRequest): ) -@router.post("/summary", response_model=SummaryGenerationResponse) -async def generate_summary(request: SummaryGenerationRequest): +@router.post("/summary", response_model=ServerSummaryGenerationResponse) +async def generate_summary( + request: SummaryGenerationRequest, +) -> ServerSummaryGenerationResponse: """ Generate evaluation summary from results. @@ -81,7 +84,7 @@ async def generate_summary(request: SummaryGenerationRequest): logger.info("Successfully generated evaluation summary") - return SummaryGenerationResponse( + return ServerSummaryGenerationResponse( summary=summary, message="Successfully generated evaluation summary", ) diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py index a374e984..1142b729 100644 --- a/rogue/server/models/api_format.py +++ b/rogue/server/models/api_format.py @@ -10,6 +10,15 @@ from pydantic import BaseModel +class StructuredSummary(BaseModel): + """Structured summary response from LLM.""" + + overall_summary: str + key_findings: List[str] + recommendations: List[str] + detailed_breakdown: List[dict] # Table rows for scenario breakdown + + class ApiChatMessage(BaseModel): """Chat message for new API format with datetime timestamp.""" @@ -45,3 +54,10 @@ class ApiEvaluationResult(BaseModel): deepTest: bool = False startTime: datetime judgeModel: Optional[str] = None + + +class ServerSummaryGenerationResponse(BaseModel): + """Server response for summary generation with structured summary.""" + + summary: StructuredSummary + message: str diff --git a/rogue/server/services/__init__.py b/rogue/server/services/__init__.py index 8e3466ae..95047763 100644 --- a/rogue/server/services/__init__.py +++ b/rogue/server/services/__init__.py @@ -1,4 +1,5 @@ from . import ( + api_format_service, evaluation_library, evaluation_service, interviewer_service, diff --git a/rogue/server/services/interviewer_service.py b/rogue/server/services/interviewer_service.py index 1caf69b7..d9d9d333 100644 --- a/rogue/server/services/interviewer_service.py +++ b/rogue/server/services/interviewer_service.py @@ -71,7 +71,7 @@ def send_message(self, user_input: str): { "role": "user", "content": user_input, - } + }, ) # Copying the messages to avoid modifying the original list @@ -87,7 +87,7 @@ def send_message(self, user_input: str): "You have asked 3 questions. Now, provide a concise summary of " "the agent's business context based on the conversation." ), - } + }, ) try: @@ -101,7 +101,7 @@ def send_message(self, user_input: str): { "role": "assistant", "content": response.choices[0].message.content, - } + }, ) return response.choices[0].message.content diff --git a/rogue/server/services/llm_service.py b/rogue/server/services/llm_service.py index de46177f..aa229c04 100644 --- a/rogue/server/services/llm_service.py +++ b/rogue/server/services/llm_service.py @@ -1,9 +1,12 @@ +import json from typing import Optional from litellm import completion from loguru import logger from rogue_sdk.types import EvaluationResults, Scenario, Scenarios, ScenarioType +from ..models.api_format import StructuredSummary + SCENARIO_GENERATION_SYSTEM_PROMPT = """ # Test Scenario Designer @@ -98,7 +101,7 @@ # Evaluation Results Summarizer You are a test results summarizer. Your task is to analyze the provided evaluation results -and generate a concise, insightful, and human-readable summary in Markdown format. +and generate a structured JSON response with the summary components. 
## Evaluation Results (JSON) @@ -106,22 +109,47 @@ ## Your Task -Based on the JSON data above, create a summary that includes: +Based on the JSON data above, create a structured summary that includes: -1. **Overall Summary**: A brief, high-level overview of the agent's performance, - highlighting the pass/fail ratio and any critical issues discovered. -2. **Key Findings**: Bullet points detailing the most significant discoveries, both - positive and negative. Focus on patterns of failure or notable successes. -3. **Recommendations**: Suggest concrete next steps for improving the agent. These +1. **overall_summary**: A brief, high-level overview of the agent's performance, + highlighting the pass/fail ratio and any critical issues discovered. Return as a single string. +2. **key_findings**: List of the most significant discoveries, both positive and negative. + Focus on patterns of failure or notable successes. Return as an array of strings. +3. **recommendations**: List of concrete next steps for improving the agent. These could include fixing specific bugs, improving training data, or clarifying policies. -4. **Detailed Breakdown**: A table that provides a granular look at each - scenario that was tested, including the pass/fail with the appropriate emoji ✅/❌ status and a brief note on the outcome. + Return as an array of strings. +4. **detailed_breakdown**: Array of objects representing a table that provides a granular + look at each scenario tested. Each object should have: scenario, status (✅/❌), outcome. + +## Output Format +You MUST respond with valid JSON in exactly this format: + +```json +{ + "overall_summary": "Brief overview text here...", + "key_findings": [ + "First key finding", + "Second key finding" + ], + "recommendations": [ + "First recommendation", + "Second recommendation" + ], + "detailed_breakdown": [ + { + "scenario": "Scenario name", + "status": "✅", + "outcome": "Brief outcome description" + } + ] +} +``` ## Guidelines - Use clear and professional language. -- Format the output using Markdown for readability (headings, bold text, lists, etc.). - Be objective and base your summary strictly on the provided data. -- Ensure the summary is well-organized and easy to navigate. +- Return ONLY valid JSON - no markdown, no explanations, no additional text. +- Ensure all strings are properly escaped for JSON. """ # noqa: E501 @@ -142,13 +170,18 @@ def generate_scenarios( context: str, llm_provider_api_key: Optional[str] = None, ) -> Scenarios: - """ - Generates scenarios for the given business context using the given model. - :param model: LLM model to use for scenario generation. - :param context: Business context to use for scenario generation. - :param llm_provider_api_key: api key for the LLM provider - (if applicable, env can also be used instead). - :return: The generated scenarios + """Generate test scenarios from business context using LLM. 
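+
+        A minimal call sketch (the model name and business context are
+        illustrative, and `llm_service` is assumed to be an LLMService
+        instance):
+
+            scenarios = llm_service.generate_scenarios(
+                model="openai/gpt-4o-mini",
+                context="Support agent for an online t-shirt store",
+            )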
+ + Args: + model: LLM model to use for generation + context: Business context description for scenario generation + llm_provider_api_key: API key for the LLM provider + + Returns: + Scenarios: Generated test scenarios + + Raises: + Exception: If scenario generation fails """ system_prompt = SCENARIO_GENERATION_SYSTEM_PROMPT.replace( r"{$BUSINESS_CONTEXT}", @@ -188,7 +221,7 @@ def generate_summary_from_results( model: str, results: EvaluationResults, llm_provider_api_key: Optional[str] = None, - ) -> str: + ) -> StructuredSummary: system_prompt = SUMMARY_GENERATION_SYSTEM_PROMPT.replace( r"{$EVALUATION_RESULTS}", results.model_dump_json(indent=2), @@ -198,7 +231,10 @@ def generate_summary_from_results( {"role": "system", "content": system_prompt}, { "role": "user", - "content": "Please generate the summary based on the provided results.", + "content": ( + "Please generate the structured summary based on the " + "provided results." + ), }, ] @@ -210,7 +246,38 @@ def generate_summary_from_results( messages=messages, api_key=api_key, ) - return response.choices[0].message.content + + # Parse the JSON response from the LLM + content = response.choices[0].message.content.strip() + + # Remove markdown code blocks if present + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + # Parse JSON and create StructuredSummary + summary_data = json.loads(content) + return StructuredSummary(**summary_data) + + except json.JSONDecodeError as e: + logger.exception(f"Failed to parse JSON response from LLM: {e}") + # Return a fallback structured summary + return StructuredSummary( + overall_summary="Error: Could not parse summary response from LLM.", + key_findings=["Unable to generate key findings due to parsing error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) except Exception: logger.exception("Failed to generate summary from results") - return "Error: Could not generate a summary for the evaluation results." + # Return a fallback structured summary + return StructuredSummary( + overall_summary=( + "Error: Could not generate a summary for the evaluation results." + ), + key_findings=["Unable to generate key findings due to system error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) diff --git a/rogue/server/services/scenario_evaluation_service.py b/rogue/server/services/scenario_evaluation_service.py index 4ed69ba0..8de94c76 100644 --- a/rogue/server/services/scenario_evaluation_service.py +++ b/rogue/server/services/scenario_evaluation_service.py @@ -78,7 +78,7 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: results = data if results and results.results: logger.info( - f"📊 Processing {len(results.results)} evaluation results" + f"📊 Processing {len(results.results)} evaluation results", ) for res in results.results: self._results.add_result(res) @@ -86,12 +86,12 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: logger.warning("⚠️ Received results update but no results data") else: # it's a 'chat' or 'status' update logger.debug( - f"🔄 Forwarding {update_type} update: {str(data)[:50]}..." + f"🔄 Forwarding {update_type} update: {str(data)[:50]}...", ) yield update_type, data logger.info( - f"🏁 arun_evaluator_agent completed. Total updates: {update_count}" + f"🏁 arun_evaluator_agent completed. 
Total updates: {update_count}", ) except Exception as e: @@ -132,6 +132,6 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: ( "✅ ScenarioEvaluationService completed with " f"{len(self._results.results)} total results" - ) + ), ) yield "done", self._results diff --git a/rogue/tests/models/test_cli_input.py b/rogue/tests/models/test_cli_input.py index 627bc921..e2571b47 100644 --- a/rogue/tests/models/test_cli_input.py +++ b/rogue/tests/models/test_cli_input.py @@ -32,7 +32,8 @@ def test_check_auth_credentials(self, auth_type, credentials, should_raise): if should_raise: with pytest.raises( - ValidationError, match="Authentication Credentials cannot be empty" + ValidationError, + match="Authentication Credentials cannot be empty", ): CLIInput(**input_data) else: diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index 4deb72a7..4c6277e8 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -8,10 +8,8 @@ EvaluationResults, Scenario, ) -from rogue.ui.components.report_generator import ( - convert_to_api_format, - ApiEvaluationResult, -) +from rogue.server.services.api_format_service import convert_to_api_format +from rogue.server.models.api_format import ApiEvaluationResult, StructuredSummary class TestEvaluationResults: @@ -171,11 +169,19 @@ def test_convert_to_api_format(self): result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) results.add_result(result) + # Create structured summary for testing + structured_summary = StructuredSummary( + overall_summary="Test summary for overall evaluation", + key_findings=["Key finding 1", "Key finding 2"], + recommendations=["Recommendation 1", "Recommendation 2"], + detailed_breakdown=[ + {"scenario": "Test", "status": "✅", "outcome": "Passed"}, + ], + ) + api_format = convert_to_api_format( evaluation_results=results, - summary="Test summary for overall evaluation", - key_findings="• Key finding 1\n• Key finding 2", - recommendation="• Recommendation 1\n• Recommendation 2", + structured_summary=structured_summary, deep_test=True, judge_model="openai/gpt-4o-mini", ) @@ -186,6 +192,13 @@ def test_convert_to_api_format(self): assert api_format.scenarios[0].totalConversations == 1 assert api_format.scenarios[0].flaggedConversations == 0 assert len(api_format.scenarios[0].conversations) == 1 + + # Test structured summary fields + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" assert api_format.scenarios[0].conversations[0].passed is True assert api_format.scenarios[0].conversations[0].reason == "reason" assert len(api_format.scenarios[0].conversations[0].messages) == 1 diff --git a/rogue/ui/components/config_screen.py b/rogue/ui/components/config_screen.py index bd8e5793..e32e1cb5 100644 --- a/rogue/ui/components/config_screen.py +++ b/rogue/ui/components/config_screen.py @@ -36,7 +36,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, you'll be guided through an AI-powered interview to " - "extract your agent's business context. Turn off to skip this step." + "extract your agent's business context. 
Turn off to skip this step.", ) gr.Markdown("**Deep Test Mode**") @@ -46,7 +46,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, the evaluator will " - "approach each scenario from different angles" + "approach each scenario from different angles", ) gr.Markdown("### Parallel Runs") @@ -76,7 +76,8 @@ def create_config_screen( ), ) auth_credentials_error = gr.Markdown( - visible=False, elem_classes=["error-label"] + visible=False, + elem_classes=["error-label"], ) gr.Markdown("## Evaluator Configuration") @@ -84,12 +85,12 @@ def create_config_screen( "Specify the models for the evaluation process. " "The **Service LLM** will be used to interview, " "generate scenarios and summaries. The **Judge LLM** is used by the " - "evaluator agent to score the agent's performance against those scenarios." + "evaluator agent to score the agent's performance against those scenarios.", ) gr.Markdown( "ℹ️ Under the hood we're using `litellm`. See the " "[list of supported models](https://docs.litellm.ai/docs/providers). " - "You can use environment variables for API keys." + "You can use environment variables for API keys.", ) service_llm = gr.Textbox( @@ -226,7 +227,8 @@ def save_config( msg = error["msg"] if loc in error_labels: error_updates[error_labels[loc]] = gr.update( - value=f"**Error:** {msg}", visible=True + value=f"**Error:** {msg}", + visible=True, ) else: logger.exception("Unhandled validation error") diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 07e3c9cf..2aa71aa4 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -1,219 +1,11 @@ from pathlib import Path from typing import Tuple -from datetime import datetime, timezone import gradio as gr from loguru import logger from rogue_sdk.types import EvaluationResults -from pydantic import BaseModel -from typing import List, Optional -import re - -def parse_summary_sections(full_summary: str) -> tuple[str, str, str]: - """Parse a comprehensive summary into separate sections. 
- - Args: - full_summary: The comprehensive summary text - - Returns: - Tuple of (summary, key_findings, recommendations) - """ - if not full_summary: - return None, None, None - - # Extract the main summary section (everything before Key Findings) - summary_match = re.search( - r"(.*?)(?=---\s*##?\s+Key Findings|##?\s+Key Findings)", - full_summary, - re.DOTALL | re.IGNORECASE, - ) - summary_section = "" - if summary_match: - summary_section = summary_match.group(1).strip() - # Clean up extra dashes and formatting - summary_section = re.sub(r"---+\s*$", "", summary_section).strip() - - # Extract Key Findings section - key_findings_match = re.search( - r"##?\s+Key Findings\s*[-]*\s*(.*?)(?=---\s*##?\s+Recommendations|##?\s+Recommendations|##?\s+Detailed Breakdown|$)", # noqa: E501 - full_summary, - re.DOTALL | re.IGNORECASE, - ) - key_findings_section = "" - if key_findings_match: - key_findings_section = key_findings_match.group(1).strip() - # Clean up bullet points and formatting - key_findings_section = re.sub( - r"^-\s*", - "", - key_findings_section, - flags=re.MULTILINE, - ) - key_findings_section = re.sub(r"---+\s*$", "", key_findings_section).strip() - # Fix bullet point formatting - key_findings_section = re.sub(r"\s*-\s*\*\*", "\n• **", key_findings_section) - if not key_findings_section.startswith( - "•", - ) and not key_findings_section.startswith("-"): - key_findings_section = "• " + key_findings_section - - # Extract Recommendations section - recommendations_match = re.search( - r"##?\s+Recommendations\s*[-]*\s*(.*?)(?=---\s*##?\s+Detailed Breakdown|##?\s+Detailed Breakdown|$)", # noqa: E501 - full_summary, - re.DOTALL | re.IGNORECASE, - ) - recommendations_section = "" - if recommendations_match: - recommendations_section = recommendations_match.group(1).strip() - # Clean up formatting - recommendations_section = re.sub( - r"---+\s*$", - "", - recommendations_section, - ).strip() - # Convert all numbered items to bullet points - recommendations_section = re.sub( - r"^\d+\.\s*", - "• ", - recommendations_section, - flags=re.MULTILINE, - ) - recommendations_section = re.sub( - r"\s+\d+\.\s*", - "\n• ", - recommendations_section, - ) - - return ( - summary_section if summary_section else None, - key_findings_section if key_findings_section else None, - recommendations_section if recommendations_section else None, - ) - - -# New API Format Types for report display -class ApiChatMessage(BaseModel): - """Chat message for new API format with datetime timestamp.""" - - role: str - content: str - timestamp: datetime - - -class ApiConversationEvaluation(BaseModel): - """Conversation evaluation for new API format.""" - - passed: bool - messages: List[ApiChatMessage] - reason: Optional[str] = None - - -class ApiScenarioResult(BaseModel): - """Result of evaluating a single scenario in new API format.""" - - description: Optional[str] = None - totalConversations: Optional[int] = None - flaggedConversations: Optional[int] = None - conversations: List[ApiConversationEvaluation] - - -class ApiEvaluationResult(BaseModel): - """New API format for evaluation results.""" - - scenarios: List[ApiScenarioResult] - summary: Optional[str] = None - keyFindings: Optional[str] = None - recommendation: Optional[str] = None - deepTest: bool = False - startTime: datetime - judgeModel: Optional[str] = None - - -def convert_to_api_format( - evaluation_results: EvaluationResults, - summary: Optional[str] = None, - key_findings: Optional[str] = None, - recommendation: Optional[str] = None, - deep_test: bool = 
False, - start_time: Optional[datetime] = None, - judge_model: Optional[str] = None, -) -> ApiEvaluationResult: - """Convert legacy EvaluationResults to new API format. - - Args: - evaluation_results: Legacy evaluation results to convert - summary: Generated summary of the evaluation - key_findings: Key findings from the evaluation - recommendation: Recommendations based on the evaluation - deep_test: Whether deep test mode was enabled - start_time: When the evaluation started (defaults to current time) - judge_model: The LLM judge model used - - Returns: - ApiEvaluationResult: New format evaluation result with additional metadata - """ - if start_time is None: - start_time = datetime.now(timezone.utc) - - api_scenarios = [] - - for result in evaluation_results.results: - # Convert conversations to new format - api_conversations = [] - for conv_eval in result.conversations: - # Convert ChatHistory messages to ApiChatMessage - api_messages = [] - for msg in conv_eval.messages.messages: - timestamp = datetime.now(timezone.utc) - if msg.timestamp: - try: - if isinstance(msg.timestamp, str): - timestamp = datetime.fromisoformat( - msg.timestamp.replace("Z", "+00:00"), - ) - else: - timestamp = msg.timestamp - except (ValueError, AttributeError): - timestamp = datetime.now(timezone.utc) - - api_messages.append( - ApiChatMessage( - role=msg.role, - content=msg.content, - timestamp=timestamp, - ), - ) - - api_conversations.append( - ApiConversationEvaluation( - passed=conv_eval.passed, - messages=api_messages, - reason=conv_eval.reason if conv_eval.reason else None, - ), - ) - - api_scenarios.append( - ApiScenarioResult( - description=result.scenario.scenario, - totalConversations=len(api_conversations), - flaggedConversations=len( - [c for c in api_conversations if not c.passed], - ), - conversations=api_conversations, - ), - ) - - return ApiEvaluationResult( - scenarios=api_scenarios, - summary=summary, - keyFindings=key_findings, - recommendation=recommendation, - deepTest=deep_test, - startTime=start_time, - judgeModel=judge_model, - ) +from ...server.services.api_format_service import convert_with_structured_summary def _load_report_data_from_files( @@ -270,26 +62,17 @@ def on_report_tab_select(state): ) results = EvaluationResults() - # Convert to new API format for display + # Convert to new API format for display using server service try: # Extract configuration and additional metadata from state config = state.get("config", {}) - # Parse the summary to extract separate sections - if summary and summary != "No summary available.": - parsed_summary, parsed_key_findings, parsed_recommendations = ( - parse_summary_sections(summary) - ) - else: - parsed_summary = None - parsed_key_findings = None - parsed_recommendations = None - - api_format_results = convert_to_api_format( + # For now, pass None for structured_summary since UI still uses + # string summaries. 
This will be updated when the UI summary generation + # is converted to structured format + api_format_results = convert_with_structured_summary( evaluation_results=results, - summary=parsed_summary, - key_findings=parsed_key_findings or state.get("key_findings"), - recommendation=parsed_recommendations or state.get("recommendation"), + structured_summary=None, # TODO: Convert UI to use structured summaries deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index a1ea60ba..d63f3866 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -481,6 +481,7 @@ def on_status_update(status_data): # final_output_path.write_text(all_results.model_dump_json(indent=2)) # Generate summary using SDK (server-based) + summary = "Summary generation failed." try: sdk_config = RogueClientConfig( base_url=state.get("rogue_server_url", "http://localhost:8000"), diff --git a/sdks/python/rogue_sdk/client.py b/sdks/python/rogue_sdk/client.py index 966e5558..f88b7fd2 100644 --- a/sdks/python/rogue_sdk/client.py +++ b/sdks/python/rogue_sdk/client.py @@ -24,6 +24,7 @@ SendMessageResponse, StartInterviewRequest, StartInterviewResponse, + StructuredSummary, SummaryGenerationRequest, SummaryGenerationResponse, ) @@ -162,7 +163,27 @@ async def generate_summary( "/api/v1/llm/summary", json=data.model_dump(mode="json"), ) - return SummaryGenerationResponse(**response) + + # Handle server's structured summary response + summary_data = response.get("summary", {}) + if isinstance(summary_data, dict) and "overall_summary" in summary_data: + # Server returned StructuredSummary - convert to our expected format + structured_summary = StructuredSummary(**summary_data) + return SummaryGenerationResponse( + summary=structured_summary, + message=response.get("message", "Successfully generated summary"), + ) + else: + # Fallback for legacy string response + return SummaryGenerationResponse( + summary=StructuredSummary( + overall_summary=str(summary_data), + key_findings=[], + recommendations=[], + detailed_breakdown=[], + ), + message=response.get("message", "Successfully generated summary"), + ) async def start_interview( self, @@ -237,7 +258,7 @@ async def wait_for_evaluation( elapsed = asyncio.get_running_loop().time() - start_time if elapsed >= max_wait_time: raise TimeoutError( - f"Evaluation {job_id} did not complete within {max_wait_time}s" + f"Evaluation {job_id} did not complete within {max_wait_time}s", ) await asyncio.sleep(poll_interval) diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index efca694c..ba89cfdb 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -114,7 +114,7 @@ def on_websocket_event( """Add WebSocket event handler.""" if not self.ws_client: raise RuntimeError( - "WebSocket not connected. Call connect_websocket() first." + "WebSocket not connected. 
Call connect_websocket() first.", ) self.ws_client.on(event, handler) @@ -192,7 +192,7 @@ def handle_final_job_result(task): result_future.set_result(result) else: result_future.set_exception( - Exception("Failed to retrieve final job result") + Exception("Failed to retrieve final job result"), ) except Exception as e: result_future.set_exception(e) @@ -210,7 +210,7 @@ def handle_chat_update(event, data): def handle_error(event, data): if not result_future.done(): result_future.set_exception( - Exception(f"WebSocket error: {data.get('error')}") + Exception(f"WebSocket error: {data.get('error')}"), ) # Connect WebSocket for updates @@ -228,7 +228,7 @@ def handle_error(event, data): return result except asyncio.TimeoutError: raise TimeoutError( - f"Evaluation {job_id} did not complete within {timeout}s" + f"Evaluation {job_id} did not complete within {timeout}s", ) finally: await self.disconnect_websocket() @@ -296,7 +296,32 @@ async def generate_summary( api_key=api_key, ) - return response_data.summary + # Convert structured summary back to string format for backward compatibility + structured_summary = response_data.summary + if hasattr(structured_summary, "overall_summary"): + # Format as markdown string for UI display + summary_parts = [ + f"# Evaluation Results Summary\n\n## Overall Summary\n" + f"{structured_summary.overall_summary}", + ] + + if structured_summary.key_findings: + findings = "\n".join( + f"- {finding}" for finding in structured_summary.key_findings + ) + summary_parts.append(f"\n---\n\n## Key Findings\n{findings}") + + if structured_summary.recommendations: + recommendations = "\n".join( + f"{i + 1}. {rec}" + for i, rec in enumerate(structured_summary.recommendations) + ) + summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}") + + return "\n".join(summary_parts) + else: + # Fallback for string response + return str(structured_summary) async def start_interview( self, diff --git a/sdks/python/rogue_sdk/tests/test_types.py b/sdks/python/rogue_sdk/tests/test_types.py index db652f7c..aa906bde 100644 --- a/sdks/python/rogue_sdk/tests/test_types.py +++ b/sdks/python/rogue_sdk/tests/test_types.py @@ -42,7 +42,10 @@ def test_validate_dataset_for_type(self, scenario_type, dataset, should_raise): ], ) def test_validate_dataset_sample_size( - self, dataset, dataset_sample_size, should_raise + self, + dataset, + dataset_sample_size, + should_raise, ): input_data = { "scenario": "Test Scenario", @@ -55,7 +58,8 @@ def test_validate_dataset_sample_size( if should_raise: with pytest.raises( - ValidationError, match="`dataset_sample_size` must be set" + ValidationError, + match="`dataset_sample_size` must be set", ): Scenario(**input_data) else: diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index c2a741fa..b1b8706c 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -433,10 +433,19 @@ class SummaryGenerationRequest(BaseModel): api_key: Optional[str] = None +class StructuredSummary(BaseModel): + """Structured summary response from LLM.""" + + overall_summary: str + key_findings: List[str] + recommendations: List[str] + detailed_breakdown: List[dict] # Table rows for scenario breakdown + + class SummaryGenerationResponse(BaseModel): """Response containing generated summary.""" - summary: str + summary: StructuredSummary message: str diff --git a/sdks/python/rogue_sdk/websocket.py b/sdks/python/rogue_sdk/websocket.py index f0a79797..7b9d1b76 100644 --- a/sdks/python/rogue_sdk/websocket.py +++ 
b/sdks/python/rogue_sdk/websocket.py @@ -164,7 +164,7 @@ def _emit(self, event: WebSocketEventType, data: Any) -> None: ) if t.exception() else None - ) + ), ) else: handler(event, data) @@ -180,7 +180,7 @@ async def _schedule_reconnect(self) -> None: delay = self.reconnect_delay * (2 ** (self.reconnect_attempts - 1)) logger.info( - f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s" + f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s", ) await asyncio.sleep(delay) From d682161be29d170c9b63c1a086f13a1db12e5087 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 12:31:02 +0300 Subject: [PATCH 05/22] wip --- rogue/server/models/__init__.py | 15 +++ rogue/server/services/api_format_service.py | 141 ++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 rogue/server/models/__init__.py create mode 100644 rogue/server/services/api_format_service.py diff --git a/rogue/server/models/__init__.py b/rogue/server/models/__init__.py new file mode 100644 index 00000000..9a5b64b5 --- /dev/null +++ b/rogue/server/models/__init__.py @@ -0,0 +1,15 @@ +"""Server models for the Rogue Agent Evaluator.""" + +from .api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, +) + +__all__ = [ + "ApiChatMessage", + "ApiConversationEvaluation", + "ApiEvaluationResult", + "ApiScenarioResult", +] diff --git a/rogue/server/services/api_format_service.py b/rogue/server/services/api_format_service.py new file mode 100644 index 00000000..44ccc2cf --- /dev/null +++ b/rogue/server/services/api_format_service.py @@ -0,0 +1,141 @@ +"""Service for converting evaluation results to API format. + +This service handles the conversion from legacy EvaluationResults +to the new enhanced API format with structured summary data. +""" + +from datetime import datetime, timezone +from typing import Optional + +from rogue_sdk.types import EvaluationResults + +from ..models.api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, + StructuredSummary, +) + + +def convert_to_api_format( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. 
+ + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM with separate sections + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started (defaults to current time) + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format evaluation result with additional metadata + """ + if start_time is None: + start_time = datetime.now(timezone.utc) + + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + # Extract structured summary components + summary = None + key_findings = None + recommendation = None + + if structured_summary: + summary = structured_summary.overall_summary + key_findings = "\n".join( + f"• {finding}" for finding in structured_summary.key_findings + ) + recommendation = "\n".join( + f"• {rec}" for rec in structured_summary.recommendations + ) + + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary, + keyFindings=key_findings, + recommendation=recommendation, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + ) + + +def convert_with_structured_summary( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert to API format with structured summary. 
+ + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format with structured summary data + """ + return convert_to_api_format( + evaluation_results=evaluation_results, + structured_summary=structured_summary, + deep_test=deep_test, + start_time=start_time, + judge_model=judge_model, + ) From 1d7e28a934d3c0641f14bb242e19089c90f8e4fc Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 13:16:57 +0300 Subject: [PATCH 06/22] server now respond with the results --- rogue/run_cli.py | 2 +- rogue/ui/components/report_generator.py | 2 +- rogue/ui/components/scenario_runner.py | 4 +++- sdks/python/rogue_sdk/sdk.py | 18 ++++++++++++++---- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index acf8979e..f0894eeb 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -203,7 +203,7 @@ async def create_report( sdk = RogueSDK(sdk_config) try: - summary = await sdk.generate_summary( + summary, structured_summary = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 2aa71aa4..db554368 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -72,7 +72,7 @@ def on_report_tab_select(state): # is converted to structured format api_format_results = convert_with_structured_summary( evaluation_results=results, - structured_summary=None, # TODO: Convert UI to use structured summaries + structured_summary=state.get("structured_summary"), deep_test=config.get("deep_test_mode", False), start_time=state.get("start_time"), judge_model=config.get("judge_llm"), diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index d63f3866..b1fa6d7e 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -489,12 +489,14 @@ def on_status_update(status_data): ) sdk = RogueSDK(sdk_config) - summary = await sdk.generate_summary( + summary, structured_summary = await sdk.generate_summary( results=all_results, model=config.get("service_llm"), api_key=config.get("judge_llm_api_key"), ) + state["structured_summary"] = structured_summary + await sdk.close() except Exception: logger.exception("Summary generation failed") diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index ba89cfdb..98bdc55a 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -5,7 +5,7 @@ """ import asyncio -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Tuple from loguru import logger from pydantic import HttpUrl @@ -25,6 +25,7 @@ RogueClientConfig, Scenarios, SendMessageResponse, + StructuredSummary, WebSocketEventType, ) from .websocket import RogueWebSocketClient @@ -288,7 +289,7 @@ async def generate_summary( results: EvaluationResults, model: str = "openai/gpt-4o-mini", api_key: Optional[str] = None, - ) -> str: + ) -> Tuple[str, StructuredSummary]: """Generate evaluation summary from results.""" response_data = await self.http_client.generate_summary( results=results, @@ -318,10 +319,19 @@ async def generate_summary( ) summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}") - return "\n".join(summary_parts) 
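
As an illustration of the markdown assembly this hunk extends (not part of the diff; all sample values are invented), a minimal sketch pushing a toy StructuredSummary through the same formatting rules:

    # Sketch only: mirrors the sections generate_summary() emits for the UI.
    from rogue_sdk.types import StructuredSummary

    s = StructuredSummary(
        overall_summary="The agent held policy in 2 of 3 scenarios.",
        key_findings=["Revealed internal tool names under pressure."],
        recommendations=["Add an explicit refusal rule for tool disclosure."],
        detailed_breakdown=[
            {"scenario": "Discount abuse", "status": "failed", "outcome": "Gave an unapproved discount"},
        ],
    )

    parts = [f"# Evaluation Results Summary\n\n## Overall Summary\n{s.overall_summary}"]
    if s.key_findings:
        findings = "\n".join(f"- {finding}" for finding in s.key_findings)
        parts.append(f"\n---\n\n## Key Findings\n{findings}")
    if s.recommendations:
        recs = "\n".join(f"{i + 1}. {rec}" for i, rec in enumerate(s.recommendations))
        parts.append(f"\n---\n\n## Recommendations\n{recs}")
    print("\n".join(parts))  # the markdown string handed back to the caller
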
+ if structured_summary.detailed_breakdown: + breakdown = "\n".join( + f"{i + 1}. {row}" + for i, row in enumerate(structured_summary.detailed_breakdown) + ) + summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{breakdown}") + + summary_parts.append("\n---\n") + + return "\n".join(summary_parts), structured_summary else: # Fallback for string response - return str(structured_summary) + return str(structured_summary), structured_summary async def start_interview( self, From 6e0d9eed9e038267767bca80f38e236755a5a1c5 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Sun, 7 Sep 2025 18:16:38 +0300 Subject: [PATCH 07/22] refactor the detailed breakdown --- packages/tui/go.mod | 1 + packages/tui/go.sum | 2 + packages/tui/internal/tui/app.go | 59 +++++++++++++++++------- packages/tui/internal/tui/evaluation.go | 38 +++++++++------ packages/tui/internal/tui/report_view.go | 18 ++++---- 5 files changed, 80 insertions(+), 38 deletions(-) diff --git a/packages/tui/go.mod b/packages/tui/go.mod index 2e986de1..413f9fe3 100644 --- a/packages/tui/go.mod +++ b/packages/tui/go.mod @@ -23,6 +23,7 @@ require ( require ( github.com/alecthomas/chroma/v2 v2.20.0 + github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 github.com/charmbracelet/colorprofile v0.3.1 // indirect github.com/charmbracelet/x/cellbuf v0.0.14-0.20250505150409-97991a1f17d1 // indirect github.com/charmbracelet/x/term v0.2.1 // indirect diff --git a/packages/tui/go.sum b/packages/tui/go.sum index a69ee239..d4e9da06 100644 --- a/packages/tui/go.sum +++ b/packages/tui/go.sum @@ -6,6 +6,8 @@ github.com/alecthomas/repr v0.5.1 h1:E3G4t2QbHTSNpPKBgMTln5KLkZHLOcU7r37J4pXBuIg github.com/alecthomas/repr v0.5.1/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 h1:swACzss0FjnyPz1enfX56GKkLiuKg5FlyVmOLIlU2kE= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1/go.mod h1:6HamsBKWqEC/FVHuQMHgQL+knPyvHH55HwJDHl/adMw= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4 h1:UgUuKKvBwgqm2ZEL+sKv/OLeavrUb4gfHgdxe6oIOno= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4/go.mod h1:0wWFRpsgF7vHsCukVZ5LAhZkiR4j875H6KEM2/tFQmA= github.com/charmbracelet/colorprofile v0.3.1 h1:k8dTHMd7fgw4bnFd7jXTLZrSU/CQrKnL3m+AxCzDz40= diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 1d9746aa..1a59e45b 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -8,7 +8,9 @@ import ( "strings" "time" + "github.com/charmbracelet/bubbles/v2/table" tea "github.com/charmbracelet/bubbletea/v2" + "github.com/pelletier/go-toml/v2" "github.com/rogue/tui/internal/components" "github.com/rogue/tui/internal/theme" @@ -85,7 +87,31 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - summary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + + overallSummary := structuredSummary.Summary.OverallSummary + keyFindings := structuredSummary.Summary.KeyFindings + parsedKeyFindings := "" + for _, finding := range keyFindings { + parsedKeyFindings += "- " + finding + "\n" + } + recommendations := structuredSummary.Summary.Recommendations + parsedRecommendations := "" + for _, recommendation := range 
recommendations { + parsedRecommendations += "- " + recommendation + "\n" + } + + detailedBreakdown := structuredSummary.Summary.DetailedBreakdown + parsedDetailedBreakdown := "" + for _, breakdown := range detailedBreakdown { + parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" + } + + summary := "## Overall Summary\n\n" + overallSummary + + "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + + "\n\n" + "## Recommendations\n\n" + parsedRecommendations + + "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown + return SummaryGeneratedMsg{ Summary: summary, Err: err, @@ -131,20 +157,21 @@ type App struct { // Model represents the main application state type Model struct { - currentScreen Screen - width int - height int - input string - cursor int - evaluations []Evaluation - scenarios []Scenario - config Config - version string - commandInput components.CommandInput - dialog *components.Dialog - dialogStack []components.Dialog - llmDialog *components.LLMConfigDialog - scenarioEditor components.ScenarioEditor + currentScreen Screen + width int + height int + input string + cursor int + evaluations []Evaluation + scenarios []Scenario + config Config + version string + commandInput components.CommandInput + dialog *components.Dialog + dialogStack []components.Dialog + llmDialog *components.LLMConfigDialog + scenarioEditor components.ScenarioEditor + detailedBreakdown []table.Row // Spinners for loading states healthSpinner components.Spinner @@ -238,7 +265,7 @@ func (a *App) Run() error { // Initialize viewports eventsViewport: components.NewViewport(1, 80, 20), summaryViewport: components.NewViewport(2, 80, 20), - reportViewport: components.NewViewport(3, 80, 20), + reportViewport: components.NewViewport(3, 80, 15), focusedViewport: 0, // Start with events viewport focused eventsAutoScroll: true, // Start with auto-scroll enabled } diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index ecbd6a2c..69bdbc2d 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -92,6 +92,20 @@ type RogueSDK struct { ws *websocket.Conn } +type SummaryResp struct { + Summary struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + Recommendations []string `json:"recommendations"` + DetailedBreakdown []struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` + } `json:"detailed_breakdown"` + } `json:"summary"` + Message string `json:"message"` +} + // NewRogueSDK creates a new SDK instance func NewRogueSDK(baseURL string) *RogueSDK { return &RogueSDK{ @@ -438,15 +452,15 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (string, error) { +func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { - return "", fmt.Errorf("failed to get evaluation results: %w", err) + return nil, fmt.Errorf("failed to get evaluation results: %w", err) } if job.Results == nil { - return "", fmt.Errorf("no results available for job %s", jobID) + return nil, fmt.Errorf("no results available for job %s", jobID) } // Prepare 
summary request - match server's SummaryGenerationRequest format @@ -460,12 +474,12 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s body, err := json.Marshal(summaryReq) if err != nil { - return "", err + return nil, err } req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/summary", bytes.NewReader(body)) if err != nil { - return "", err + return nil, err } req.Header.Set("Content-Type", "application/json") @@ -476,24 +490,22 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s resp, err := longTimeoutClient.Do(req) if err != nil { - return "", err + return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) - return "", fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) + return nil, fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) } - var summaryResp struct { - Summary string `json:"summary"` - Message string `json:"message"` - } + var summaryResp SummaryResp + if err := json.NewDecoder(resp.Body).Decode(&summaryResp); err != nil { - return "", err + return nil, err } - return summaryResp.Summary, nil + return &summaryResp, nil } // CheckServerHealth calls GET /health and returns the status string diff --git a/packages/tui/internal/tui/report_view.go b/packages/tui/internal/tui/report_view.go index 08c71e45..76ba8c5d 100644 --- a/packages/tui/internal/tui/report_view.go +++ b/packages/tui/internal/tui/report_view.go @@ -22,7 +22,7 @@ func (m Model) renderReport() string { // Main container style with full width and height background mainStyle := lipgloss.NewStyle(). Width(m.width). - Height(m.height - 1). // -1 for footer + Height(m.height - 12). Background(t.Background()) // Title style @@ -58,12 +58,12 @@ func (m Model) renderReport() string { } // Calculate viewport dimensions - viewportWidth := m.width - 4 // Leave margins - viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) + viewportWidth := m.width - 8 // Leave margins + viewportHeight := m.height - 4 // title(3) + help(1) + margins(4) // Create a temporary copy of the viewport to avoid modifying the original viewport := m.reportViewport - viewport.SetSize(viewportWidth, viewportHeight) + viewport.SetSize(viewportWidth, viewportHeight-2) viewport.SetContent(reportContent) // Style the viewport with border @@ -71,14 +71,14 @@ func (m Model) renderReport() string { Border(lipgloss.RoundedBorder()). BorderForeground(t.Border()). BorderBackground(t.BackgroundPanel()). - Background(t.BackgroundPanel()). - Width(viewportWidth + 2). // +2 for border - Height(viewportHeight + 2) // +2 for border + Background(t.BackgroundPanel()) // Apply viewport styling viewport.Style = lipgloss.NewStyle(). Foreground(t.Text()). Background(t.BackgroundPanel()). + Width(viewportWidth). + Height(viewportHeight-8). Padding(1, 2) // Help text style @@ -102,13 +102,13 @@ func (m Model) renderReport() string { // Center the viewport in the available space contentArea := lipgloss.NewStyle(). Width(m.width). - Height(viewportHeight + 2). + Height(viewportHeight). 
Background(t.Background()) centeredViewport := contentArea.Render( lipgloss.Place( m.width, - viewportHeight+2, + viewportHeight, lipgloss.Center, lipgloss.Top, viewportContent, From c30193d64ca1b4f6617377a1d2b54dd9c2ec0862 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Mon, 8 Sep 2025 15:04:07 +0300 Subject: [PATCH 08/22] Hotfix - change the token input count --- packages/tui/internal/tui/report_view.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/tui/internal/tui/report_view.go b/packages/tui/internal/tui/report_view.go index 76ba8c5d..84bf31ec 100644 --- a/packages/tui/internal/tui/report_view.go +++ b/packages/tui/internal/tui/report_view.go @@ -59,7 +59,7 @@ func (m Model) renderReport() string { // Calculate viewport dimensions viewportWidth := m.width - 8 // Leave margins - viewportHeight := m.height - 4 // title(3) + help(1) + margins(4) + viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) // Create a temporary copy of the viewport to avoid modifying the original viewport := m.reportViewport @@ -68,6 +68,7 @@ func (m Model) renderReport() string { // Style the viewport with border viewportStyle := lipgloss.NewStyle(). + Height(viewportHeight - 8). Border(lipgloss.RoundedBorder()). BorderForeground(t.Border()). BorderBackground(t.BackgroundPanel()). @@ -102,13 +103,13 @@ func (m Model) renderReport() string { // Center the viewport in the available space contentArea := lipgloss.NewStyle(). Width(m.width). - Height(viewportHeight). + Height(viewportHeight - 8). Background(t.Background()) centeredViewport := contentArea.Render( lipgloss.Place( m.width, - viewportHeight, + viewportHeight-8, lipgloss.Center, lipgloss.Top, viewportContent, From 45c037f52a98392748489c7e7fd586f4fcf6a806 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 13:39:52 +0300 Subject: [PATCH 09/22] report is working when precondifuring the api key --- packages/tui/internal/tui/app.go | 2 +- packages/tui/internal/tui/eval_ui.go | 16 ++- packages/tui/internal/tui/evaluation.go | 17 ++-- rogue/evaluator_agent/evaluator_agent.py | 4 + rogue/run_cli.py | 10 +- rogue/server/api/__init__.py | 7 +- rogue/server/api/evaluation.py | 2 + rogue/server/api/llm.py | 106 +++++++++++++++++++- rogue/server/models/api_format.py | 11 +- rogue/server/services/__init__.py | 1 + rogue/server/services/api_format_service.py | 4 +- rogue/server/services/llm_service.py | 2 +- rogue/server/services/qualifire_service.py | 40 ++++++++ rogue/ui/components/scenario_runner.py | 3 + sdks/python/rogue_sdk/client.py | 8 ++ sdks/python/rogue_sdk/sdk.py | 8 ++ sdks/python/rogue_sdk/types.py | 26 +++++ 17 files changed, 239 insertions(+), 28 deletions(-) create mode 100644 rogue/server/services/qualifire_service.py diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 1a59e45b..3e350e60 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -87,7 +87,7 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) + structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) overallSummary := structuredSummary.Summary.OverallSummary keyFindings := structuredSummary.Summary.KeyFindings diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go index 2c8eba6d..e5e738ad 100644 
--- a/packages/tui/internal/tui/eval_ui.go +++ b/packages/tui/internal/tui/eval_ui.go @@ -14,7 +14,7 @@ type EvaluationViewState struct { JudgeModel string ParallelRuns int DeepTest bool - Scenarios []string + Scenarios []EvalScenario // Runtime Running bool @@ -35,7 +35,7 @@ type EvaluationViewState struct { } // loadScenariosFromWorkdir reads .rogue/scenarios.json upward from CWD -func loadScenariosFromWorkdir() []string { +func loadScenariosFromWorkdir() []EvalScenario { wd, _ := os.Getwd() dir := wd for { @@ -43,14 +43,20 @@ func loadScenariosFromWorkdir() []string { if b, err := os.ReadFile(p); err == nil { var v struct { Scenarios []struct { - Scenario string `json:"scenario"` + Scenario string `json:"scenario"` + ScenarioType string `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome"` } `json:"scenarios"` } if json.Unmarshal(b, &v) == nil { - out := make([]string, 0, len(v.Scenarios)) + out := make([]EvalScenario, 0, len(v.Scenarios)) for _, s := range v.Scenarios { if s.Scenario != "" { - out = append(out, s.Scenario) + out = append(out, EvalScenario{ + Scenario: s.Scenario, + ScenarioType: ScenarioType(s.ScenarioType), + ExpectedOutcome: s.ExpectedOutcome, + }) } } return out diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 69bdbc2d..8575556b 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -42,8 +42,9 @@ type AgentConfig struct { } type EvalScenario struct { - Scenario string `json:"scenario"` - ScenarioType ScenarioType `json:"scenario_type"` + Scenario string `json:"scenario"` + ScenarioType ScenarioType `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome,omitempty"` } type EvaluationRequest struct { @@ -415,7 +416,7 @@ func (sdk *RogueSDK) CancelEvaluation(ctx context.Context, jobID string) error { } // StartEvaluation is the main entry point used by the TUI -func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []string, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { +func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []EvalScenario, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { sdk := NewRogueSDK(serverURL) // Validate URLs @@ -443,8 +444,9 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, // Convert scenarios for _, s := range scenarios { request.Scenarios = append(request.Scenarios, EvalScenario{ - Scenario: s, - ScenarioType: ScenarioTypePolicy, + Scenario: s.Scenario, + ScenarioType: s.ScenarioType, + ExpectedOutcome: s.ExpectedOutcome, }) } @@ -452,7 +454,7 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (*SummaryResp, error) { +func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string, qualifireAPIKey *string) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { @@ -470,6 +472,9 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s "results": map[string]interface{}{ "results": job.Results, }, + "job_id": jobID, + "qualifire_api_key": *qualifireAPIKey, + "qualifire_url": 
"http://localhost:3000", } body, err := json.Marshal(summaryReq) diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py index 79e4f608..d6d25ed8 100644 --- a/rogue/evaluator_agent/evaluator_agent.py +++ b/rogue/evaluator_agent/evaluator_agent.py @@ -122,6 +122,7 @@ - `scenario`: The entire scenario json object being tested. The json-object contains: - "scenario": The scenario text. - "scenario_type": The scenario type. + - "expected_outcome": The expected outcome of the scenario. - `context_id`: The conversation's context ID - `evaluation_passed`: Boolean indicating whether the agent complied with the policy. You should determine this based on the conversation. - `reason`: A brief explanation of your decision @@ -363,6 +364,7 @@ def _log_evaluation( context_id: str, evaluation_passed: bool, reason: str, + scenario_type: Optional[str], ) -> None: """ Logs the evaluation of the given scenario and test case. @@ -370,6 +372,7 @@ def _log_evaluation( This is the scenario dictionary containing both the scenario text and type: - scenario: The scenario text. - scenario_type: The scenario type. + - expected_outcome: The expected outcome of the scenario. :param context_id: The conversation's context_id. This allows us to distinguish which conversation is being evaluated. :param evaluation_passed: A boolean value with the evaluation result. This is @@ -391,6 +394,7 @@ def _log_evaluation( ), "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, + "scenario_type": scenario_type, }, ) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index f0894eeb..82eb60a6 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -188,6 +188,9 @@ async def create_report( results: EvaluationResults, output_report_file: Path, judge_llm_api_key_secret: SecretStr | None = None, + qualifire_api_key_secret: SecretStr | None = None, + deep_test_mode: bool = False, + judge_model: str | None = None, ) -> str: judge_llm_api_key = ( judge_llm_api_key_secret.get_secret_value() @@ -203,10 +206,13 @@ async def create_report( sdk = RogueSDK(sdk_config) try: - summary, structured_summary = await sdk.generate_summary( + summary, _ = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, + qualifire_api_key=qualifire_api_key_secret, + deep_test=deep_test_mode, + judge_model=judge_model, ) finally: await sdk.close() @@ -352,6 +358,8 @@ async def run_cli(args: Namespace) -> int: results=results, output_report_file=cli_input.output_report_file, judge_llm_api_key_secret=cli_input.judge_llm_api_key, + deep_test_mode=cli_input.deep_test_mode, + judge_model=cli_input.judge_llm, ) logger.info("Report saved", extra={"report_file": cli_input.output_report_file}) diff --git a/rogue/server/api/__init__.py b/rogue/server/api/__init__.py index e1dec064..0b1b8c2e 100644 --- a/rogue/server/api/__init__.py +++ b/rogue/server/api/__init__.py @@ -2,6 +2,11 @@ API endpoints for the Rogue Agent Evaluator Server. """ -from . import evaluation, health, interview, llm +from . 
import ( + evaluation, + health, + interview, + llm, +) __all__ = ["evaluation", "health", "interview", "llm"] diff --git a/rogue/server/api/evaluation.py b/rogue/server/api/evaluation.py index 00e29c2f..636dbf0c 100644 --- a/rogue/server/api/evaluation.py +++ b/rogue/server/api/evaluation.py @@ -59,6 +59,8 @@ async def create_evaluation( status=EvaluationStatus.PENDING, created_at=datetime.now(timezone.utc), request=request, + deep_test=request.agent_config.deep_test_mode, + judge_model=request.agent_config.judge_llm, ) await evaluation_service.add_job(job) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 73a312f4..acc0166b 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -4,17 +4,24 @@ This module provides REST API endpoints for LLM operations. """ -from fastapi import APIRouter, HTTPException +from datetime import datetime, timezone +from fastapi import APIRouter, Depends, HTTPException from rogue_sdk.types import ( ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, + ReportSummaryResponse, + ReportSummaryRequest, ) +from rogue.server.api.evaluation import get_evaluation_service +from rogue.server.services.evaluation_service import EvaluationService + from ..models.api_format import ServerSummaryGenerationResponse from ...common.logging import get_logger from ..services.llm_service import LLMService +from ..services.qualifire_service import QualifireService router = APIRouter(prefix="/llm", tags=["llm"]) logger = get_logger(__name__) @@ -58,9 +65,13 @@ async def generate_scenarios(request: ScenarioGenerationRequest): ) -@router.post("/summary", response_model=ServerSummaryGenerationResponse) +@router.post( + "/summary", + response_model=ServerSummaryGenerationResponse, +) async def generate_summary( request: SummaryGenerationRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), ) -> ServerSummaryGenerationResponse: """ Generate evaluation summary from results. 
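
Before the next hunk lands, it helps to see the request shape this endpoint accepts once the Qualifire fields are threaded through. A rough sketch, assuming a local server on port 8000 and placeholder values; the payload mirrors SummaryGenerationRequest from sdks/python/rogue_sdk/types.py:

    # Sketch: exercising the extended summary endpoint by hand.
    import requests

    payload = {
        "results": {"results": []},          # serialized EvaluationResults
        "model": "openai/gpt-4.1",
        "job_id": "job-123",                 # lets the server pull deep_test/judge_model off the job
        "deep_test": False,
        "judge_model": "openai/gpt-4o-mini",
        "qualifire_api_key": "qf-...",       # key + job_id together trigger Qualifire reporting
    }

    resp = requests.post("http://localhost:8000/api/v1/llm/summary", json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json()["summary"]["overall_summary"])  # StructuredSummary in the response body
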
@@ -84,6 +95,56 @@ async def generate_summary( logger.info("Successfully generated evaluation summary") + logger.info( + "Qualifire API key", + extra={"qualifire_api_key": request.qualifire_api_key}, + ) + logger.info( + "Job ID", + extra={"job_id": request.job_id}, + ) + logger.info( + "Qualifire URL", + extra={"qualifire_url": request.qualifire_url}, + ) + + if request.qualifire_api_key and request.job_id: + + logger.info( + "Reporting summary to Qualifire", + extra={"job_id": request.job_id}, + ) + + job = await evaluation_service.get_job(request.job_id) + + if not job and not request.judge_model and not request.deep_test: + raise HTTPException( + status_code=400, + detail="Job not found and judge model and deep test are not provided", # noqa: E501 + ) + + logger.info( + "Summary", + extra={"summary": summary, "results": request.results}, + ) + + QualifireService.report_summary( + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=summary, + deep_test=job.deep_test if job else request.deep_test, + start_time=( + job.created_at + if job is not None + else datetime.now(timezone.utc) + ), + judge_model=job.judge_model if job else request.judge_model, + qualifire_url=request.qualifire_url, + qualifire_api_key=request.qualifire_api_key, + ), + evaluation_result=request.results, + ) + return ServerSummaryGenerationResponse( summary=summary, message="Successfully generated evaluation summary", @@ -95,3 +156,44 @@ async def generate_summary( status_code=500, detail=f"Failed to generate summary: {str(e)}", ) + + +@router.post("/report_summary", response_model=ReportSummaryResponse) +async def report_summary_handler( + request: ReportSummaryRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), +): + """ + Report summary to Qualifire. 
+ """ + try: + job = await evaluation_service.get_job(request.job_id) + + if not job: + raise HTTPException( + status_code=404, + detail="Evaluation job not found", + ) + + results = job.results + + if not results or len(results) == 0: + raise HTTPException( + status_code=404, + detail="Evaluation results not found or empty", + ) + + QualifireService.report_summary( + request, + evaluation_result=results[0], + ) + + return ReportSummaryResponse( + success=True, + ) + except Exception as e: + logger.exception("Failed to report summary") + raise HTTPException( + status_code=500, + detail=f"Failed to report summary: {str(e)}", + ) diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py index 1142b729..f3895b8e 100644 --- a/rogue/server/models/api_format.py +++ b/rogue/server/models/api_format.py @@ -8,15 +8,7 @@ from typing import List, Optional from pydantic import BaseModel - - -class StructuredSummary(BaseModel): - """Structured summary response from LLM.""" - - overall_summary: str - key_findings: List[str] - recommendations: List[str] - detailed_breakdown: List[dict] # Table rows for scenario breakdown +from rogue_sdk.types import StructuredSummary class ApiChatMessage(BaseModel): @@ -39,6 +31,7 @@ class ApiScenarioResult(BaseModel): """Result of evaluating a single scenario in new API format.""" description: Optional[str] = None + expectedOutcome: Optional[str] = None totalConversations: Optional[int] = None flaggedConversations: Optional[int] = None conversations: List[ApiConversationEvaluation] diff --git a/rogue/server/services/__init__.py b/rogue/server/services/__init__.py index 95047763..6b0b3aaf 100644 --- a/rogue/server/services/__init__.py +++ b/rogue/server/services/__init__.py @@ -5,4 +5,5 @@ interviewer_service, llm_service, scenario_evaluation_service, + qualifire_service, ) diff --git a/rogue/server/services/api_format_service.py b/rogue/server/services/api_format_service.py index 44ccc2cf..d63336e2 100644 --- a/rogue/server/services/api_format_service.py +++ b/rogue/server/services/api_format_service.py @@ -7,14 +7,13 @@ from datetime import datetime, timezone from typing import Optional -from rogue_sdk.types import EvaluationResults +from rogue_sdk.types import EvaluationResults, StructuredSummary from ..models.api_format import ( ApiChatMessage, ApiConversationEvaluation, ApiEvaluationResult, ApiScenarioResult, - StructuredSummary, ) @@ -80,6 +79,7 @@ def convert_to_api_format( api_scenarios.append( ApiScenarioResult( description=result.scenario.scenario, + expectedOutcome=result.scenario.expected_outcome, totalConversations=len(api_conversations), flaggedConversations=len( [c for c in api_conversations if not c.passed], diff --git a/rogue/server/services/llm_service.py b/rogue/server/services/llm_service.py index aa229c04..a775edf9 100644 --- a/rogue/server/services/llm_service.py +++ b/rogue/server/services/llm_service.py @@ -4,8 +4,8 @@ from litellm import completion from loguru import logger from rogue_sdk.types import EvaluationResults, Scenario, Scenarios, ScenarioType +from rogue_sdk.types import StructuredSummary -from ..models.api_format import StructuredSummary SCENARIO_GENERATION_SYSTEM_PROMPT = """ # Test Scenario Designer diff --git a/rogue/server/services/qualifire_service.py b/rogue/server/services/qualifire_service.py new file mode 100644 index 00000000..05aabf60 --- /dev/null +++ b/rogue/server/services/qualifire_service.py @@ -0,0 +1,40 @@ +import requests +from loguru import logger + +from .api_format_service import 
convert_with_structured_summary +from rogue_sdk.types import EvaluationResult, ReportSummaryRequest + + +class QualifireService: + @staticmethod + def report_summary( + request: ReportSummaryRequest, + evaluation_result: EvaluationResult, + ): + logger.info( + "Reporting summary to Qualifire", + ) + + api_evaluation_result = convert_with_structured_summary( + evaluation_results=evaluation_result, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=request.start_time, + judge_model=request.judge_model, + ) + + response = requests.post( + f"{request.qualifire_url}/api/rogue/v1/report", + headers={"X-qualifire-key": request.qualifire_api_key}, + json=api_evaluation_result.model_dump(mode="json"), + timeout=300, + ) + + if not response.ok: + logger.error( + "Failed to report summary to Qualifire", + extra={"response": response.json()}, + ) + raise Exception(f"Failed to report summary to Qualifire: {response.json()}") + + return response.json() diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index b1fa6d7e..e46c5540 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -493,6 +493,9 @@ def on_status_update(status_data): results=all_results, model=config.get("service_llm"), api_key=config.get("judge_llm_api_key"), + qualifire_api_key=config.get("qualifire_api_key"), + deep_test=config.get("deep_test_mode", False), + judge_model=config.get("judge_llm"), ) state["structured_summary"] = structured_summary diff --git a/sdks/python/rogue_sdk/client.py b/sdks/python/rogue_sdk/client.py index f88b7fd2..09546da8 100644 --- a/sdks/python/rogue_sdk/client.py +++ b/sdks/python/rogue_sdk/client.py @@ -149,11 +149,19 @@ async def generate_summary( results: EvaluationResults, model: str, api_key: Optional[str] = None, + qualifire_api_key: Optional[str] = None, + job_id: Optional[str] = None, + deep_test: bool = False, + judge_model: Optional[str] = None, ) -> SummaryGenerationResponse: """Generate summary via API.""" data = SummaryGenerationRequest( results=results, model=model, + qualifire_api_key=qualifire_api_key, + job_id=job_id, + deep_test=deep_test, + judge_model=judge_model, ) if api_key: data.api_key = api_key diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py index 98bdc55a..d1ec55cf 100644 --- a/sdks/python/rogue_sdk/sdk.py +++ b/sdks/python/rogue_sdk/sdk.py @@ -289,12 +289,20 @@ async def generate_summary( results: EvaluationResults, model: str = "openai/gpt-4o-mini", api_key: Optional[str] = None, + qualifire_api_key: Optional[str] = None, + job_id: Optional[str] = None, + deep_test: bool = False, + judge_model: Optional[str] = None, ) -> Tuple[str, StructuredSummary]: """Generate evaluation summary from results.""" response_data = await self.http_client.generate_summary( results=results, model=model, api_key=api_key, + qualifire_api_key=qualifire_api_key, + job_id=job_id, + deep_test=deep_test, + judge_model=judge_model, ) # Convert structured summary back to string format for backward compatibility diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index b1b8706c..6842a61f 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -77,6 +77,7 @@ class AgentConfig(BaseModel): parallel_runs: int = 1 judge_llm_api_key: Optional[str] = None business_context: str = "" + qualifire_api_key: Optional[str] = None @model_validator(mode="after") def check_auth_credentials(self) -> "AgentConfig": @@ 
-385,6 +386,8 @@ class EvaluationJob(BaseModel): results: Optional[List[EvaluationResult]] = None error_message: Optional[str] = None progress: float = 0.0 + deep_test: bool = False + judge_model: Optional[str] = None class EvaluationResponse(BaseModel): @@ -431,6 +434,11 @@ class SummaryGenerationRequest(BaseModel): results: EvaluationResults model: str = "openai/gpt-4.1" api_key: Optional[str] = None + qualifire_api_key: Optional[str] = None + job_id: str = "" + deep_test: bool = False + judge_model: Optional[str] = None + qualifire_url: Optional[str] = "https://app.qualifire.ai" class StructuredSummary(BaseModel): @@ -488,3 +496,21 @@ def validate_base_url(cls, v: str | HttpUrl) -> HttpUrl: if isinstance(v, str): return HttpUrl(v) return v + + +class ReportSummaryRequest(BaseModel): + """Request to report a summary.""" + + job_id: str + structured_summary: Optional[StructuredSummary] = None + deep_test: bool = False + start_time: Optional[datetime] = None + judge_model: Optional[str] = None + qualifire_api_key: Optional[str] = None + qualifire_url: Optional[str] = "https://app.qualifire.ai" + + +class ReportSummaryResponse(BaseModel): + """Response to report a summary.""" + + success: bool From 7036bb0e8337386bfac13738379be7f006596f54 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 14:38:13 +0300 Subject: [PATCH 10/22] fixed paste --- .../internal/components/llm_config_dialog.go | 6 +++--- packages/tui/internal/tui/app.go | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/packages/tui/internal/components/llm_config_dialog.go b/packages/tui/internal/components/llm_config_dialog.go index b07471a3..54fcd104 100644 --- a/packages/tui/internal/components/llm_config_dialog.go +++ b/packages/tui/internal/components/llm_config_dialog.go @@ -549,7 +549,7 @@ func (d LLMConfigDialog) handleEnter() (LLMConfigDialog, tea.Cmd) { // handlePaste handles clipboard paste operation for API key input func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { // Get clipboard content based on the operating system - clipboardText, err := getClipboardContent() + clipboardText, err := GetClipboardContent() if err != nil { // If clipboard reading fails, just return without error return d, nil @@ -569,8 +569,8 @@ func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { return d, nil } -// getClipboardContent reads content from the system clipboard -func getClipboardContent() (string, error) { +// GetClipboardContent reads content from the system clipboard +func GetClipboardContent() (string, error) { var cmd *exec.Cmd switch runtime.GOOS { diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 3e350e60..f1f9a30c 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -312,6 +312,25 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { } return m, tea.Batch(cmds...) 
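
The branch added just below reuses the newly exported GetClipboardContent and flattens whatever was pasted before appending it to the open dialog's input. The cleaning rule is tiny; a sketch of the equivalent logic (Python used here purely for illustration; the TUI does this in Go via strings.ReplaceAll and strings.TrimSpace):

    # Sketch: newline-free, trimmed paste text, matching the Go handler below.
    def clean_clipboard(text: str) -> str:
        return text.replace("\n", "").strip()

    assert clean_clipboard("  qf-abc\ndef  \n") == "qf-abcdef"
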
} + + if m.dialog != nil { + clipboardText, err := components.GetClipboardContent() + if err != nil { + // If clipboard reading fails, just return without error + return m, nil + } + + // Clean the clipboard text (remove newlines and trim whitespace) + cleanText := strings.TrimSpace(strings.ReplaceAll(clipboardText, "\n", "")) + + if cleanText == "" { + return m, nil + } + + m.dialog.Input += cleanText + m.dialog.InputCursor = len(m.dialog.Input) + return m, nil + } case components.SpinnerTickMsg: // Update spinners m.healthSpinner, cmd = m.healthSpinner.Update(msg) From 64ff4641bcb892617241ece94aa4f6e7f58f421b Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Tue, 9 Sep 2025 15:52:11 +0300 Subject: [PATCH 11/22] wip --- sdks/python/rogue_sdk/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index 6842a61f..d7a929d6 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -434,10 +434,10 @@ class SummaryGenerationRequest(BaseModel): results: EvaluationResults model: str = "openai/gpt-4.1" api_key: Optional[str] = None - qualifire_api_key: Optional[str] = None job_id: str = "" deep_test: bool = False judge_model: Optional[str] = None + qualifire_api_key: Optional[str] = None qualifire_url: Optional[str] = "https://app.qualifire.ai" From 2f23574d1f24ce2cfe558efcc82eafe41f200ac8 Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 13:17:48 +0300 Subject: [PATCH 12/22] fixed report_summary --- rogue/evaluator_agent/evaluator_agent.py | 4 ++++ rogue/server/api/llm.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py index d6d25ed8..001ae760 100644 --- a/rogue/evaluator_agent/evaluator_agent.py +++ b/rogue/evaluator_agent/evaluator_agent.py @@ -395,6 +395,10 @@ def _log_evaluation( "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, "scenario_type": scenario_type, + "expected_outcome": scenario.get( + "expected_outcome", + "None", + ), }, ) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index acc0166b..367358f0 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -7,6 +7,7 @@ from datetime import datetime, timezone from fastapi import APIRouter, Depends, HTTPException from rogue_sdk.types import ( + EvaluationResults, ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, @@ -185,7 +186,7 @@ async def report_summary_handler( QualifireService.report_summary( request, - evaluation_result=results[0], + evaluation_result=EvaluationResults(results=results), ) return ReportSummaryResponse( @@ -194,6 +195,6 @@ async def report_summary_handler( except Exception as e: logger.exception("Failed to report summary") raise HTTPException( - status_code=500, + status_code=e.status_code if hasattr(e, "status_code") else 500, detail=f"Failed to report summary: {str(e)}", ) From 5cb2a0cdf3bc3b51d9f8f2a6a77bb737ce47805f Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 14:04:27 +0300 Subject: [PATCH 13/22] tui report save --- packages/tui/internal/tui/app.go | 82 ++++++++++++++++++++++- packages/tui/internal/tui/eval_ui.go | 1 + packages/tui/internal/tui/evaluation.go | 88 +++++++++++++++++++++---- rogue/server/api/llm.py | 10 ++- sdks/python/rogue_sdk/types.py | 5 +- 5 files changed, 167 insertions(+), 19 deletions(-) diff --git a/packages/tui/internal/tui/app.go 
b/packages/tui/internal/tui/app.go index f1f9a30c..0fe05b7b 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -87,7 +87,25 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - structuredSummary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey, &m.config.QualifireAPIKey) + structuredSummary, err := sdk.GenerateSummary( + ctx, + m.evalState.JobID, + judgeModel, + apiKey, + &m.config.QualifireAPIKey, + m.evalState.DeepTest, + judgeModel, + m.config.ServerURL, + ) + + if err != nil { + return SummaryGeneratedMsg{ + Summary: "", + Err: err, + } + } + + m.evalState.StructuredSummary = structuredSummary.Summary overallSummary := structuredSummary.Summary.OverallSummary keyFindings := structuredSummary.Summary.KeyFindings @@ -531,6 +549,66 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // Handle dialog closure if m.dialog != nil { switch msg.Action { + case "save_qualifire_and_report": + // Handle Qualifire API key save and report persistence + if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { + // Save the API key to config (allow empty to clear the key) + m.config.QualifireAPIKey = msg.Input + // Only enable integration if there's an API key + if msg.Input != "" { + m.config.QualifireEnabled = true + if m.configState != nil { + m.configState.QualifireEnabled = true + m.configState.HasChanges = true + } + } + + // immediately report the summary + if m.evalState != nil && m.evalState.Completed { + sdk := NewRogueSDK(m.config.ServerURL) + err := sdk.ReportSummary( + context.Background(), + m.evalState.JobID, + m.evalState.StructuredSummary, + m.evalState.DeepTest, + m.evalState.JudgeModel, + m.config.QualifireAPIKey, + ) + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Report Summary Error", + fmt.Sprintf("Failed to report summary: %v", err), + ) + m.dialog = &errorDialog + } + + err = m.saveConfig() + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Configuration Error", + fmt.Sprintf("Failed to save Qualifire configuration: %v", err), + ) + m.dialog = &errorDialog + return m, nil + } else { + // Show appropriate success dialog + var message string + if msg.Input != "" { + message = "Qualifire API key has been successfully saved and integration is now enabled. Your evaluation report will now be automatically persisted." + } else { + message = "Qualifire API key has been cleared and integration is now disabled." 
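
Once the key is saved and a completed evaluation is in memory, this handler calls sdk.ReportSummary (call site above; the Go implementation is in this patch's evaluation.go hunk further down), which posts the structured summary to the Rogue server. A hedged sketch of the equivalent request with placeholder values, mirroring ReportSummaryRequest from sdks/python/rogue_sdk/types.py:

    # Sketch: the report the "save and report" path sends to the Rogue server.
    import requests

    report = {
        "job_id": "job-123",
        "structured_summary": {
            "overall_summary": "...",
            "key_findings": [],
            "recommendations": [],
            "detailed_breakdown": [],
        },
        "deep_test": False,
        "judge_model": "openai/gpt-4o-mini",
        "qualifire_api_key": "qf-...",
    }

    resp = requests.post("http://localhost:8000/api/v1/llm/report_summary", json=report, timeout=300)
    resp.raise_for_status()  # the server answers {"success": true} on success
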
+ } + successDialog := components.NewInfoDialog( + "Qualifire Configured", + message, + ) + m.dialog = &successDialog + return m, nil + } + } + } case "save_qualifire": // Handle Qualifire API key save if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { @@ -589,7 +667,7 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { ) // Customize the buttons for this specific use case dialog.Buttons = []components.DialogButton{ - {Label: "Save", Action: "save_qualifire", Style: components.PrimaryButton}, + {Label: "Save", Action: "save_qualifire_and_report", Style: components.PrimaryButton}, } // Position cursor at end of existing key if there is one dialog.InputCursor = len(m.config.QualifireAPIKey) diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go index e5e738ad..8d2e9d92 100644 --- a/packages/tui/internal/tui/eval_ui.go +++ b/packages/tui/internal/tui/eval_ui.go @@ -28,6 +28,7 @@ type EvaluationViewState struct { JobID string // For tracking the evaluation job Completed bool // Whether evaluation finished successfully SummaryGenerated bool // Whether summary generation was already attempted + StructuredSummary StructuredSummary // Editing state for New Evaluation currentField int // 0: AgentURL, 1: JudgeModel, 2: DeepTest, 3: StartButton diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 8575556b..920cc11d 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -93,18 +93,19 @@ type RogueSDK struct { ws *websocket.Conn } +type StructuredSummary struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + Recommendations []string `json:"recommendations"` + DetailedBreakdown []struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` + } `json:"detailed_breakdown"` +} type SummaryResp struct { - Summary struct { - OverallSummary string `json:"overall_summary"` - KeyFindings []string `json:"key_findings"` - Recommendations []string `json:"recommendations"` - DetailedBreakdown []struct { - Scenario string `json:"scenario"` - Status string `json:"status"` - Outcome string `json:"outcome"` - } `json:"detailed_breakdown"` - } `json:"summary"` - Message string `json:"message"` + Summary StructuredSummary `json:"summary"` + Message string `json:"message"` } // NewRogueSDK creates a new SDK instance @@ -416,7 +417,15 @@ func (sdk *RogueSDK) CancelEvaluation(ctx context.Context, jobID string) error { } // StartEvaluation is the main entry point used by the TUI -func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []EvalScenario, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { +func (m *Model) StartEvaluation( + ctx context.Context, + serverURL string, + agentURL string, + scenarios []EvalScenario, + judgeModel string, + parallelRuns int, + deepTest bool, +) (<-chan EvaluationEvent, func() error, error) { sdk := NewRogueSDK(serverURL) // Validate URLs @@ -454,7 +463,14 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string, qualifireAPIKey *string) (*SummaryResp, error) { +func (sdk *RogueSDK) GenerateSummary( + ctx context.Context, + jobID, model, apiKey string, + 
qualifireAPIKey *string, + deepTest bool, + judgeModel string, + qualifireURL string, +) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { @@ -474,7 +490,9 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s }, "job_id": jobID, "qualifire_api_key": *qualifireAPIKey, - "qualifire_url": "http://localhost:3000", + "qualifire_url": qualifireURL, + "deep_test": deepTest, + "judge_model": judgeModel, } body, err := json.Marshal(summaryReq) @@ -513,6 +531,48 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s return &summaryResp, nil } +// ReportSummary reports a summary to Qualifire +func (sdk *RogueSDK) ReportSummary( + ctx context.Context, + jobID string, + summary StructuredSummary, + deepTest bool, + judgeModel string, + qualifireAPIKey string, +) error { + reportReq := map[string]interface{}{ + "job_id": jobID, + "structured_summary": summary, + "deep_test": deepTest, + "judge_model": judgeModel, + "qualifire_api_key": qualifireAPIKey, + } + + body, err := json.Marshal(reportReq) + if err != nil { + return err + } + + req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/report_summary", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := sdk.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("report summary failed: %d %s", resp.StatusCode, string(body)) + } + + return nil +} + // CheckServerHealth calls GET /health and returns the status string func (m *Model) CheckServerHealth(ctx context.Context, serverURL string) (string, error) { sdk := NewRogueSDK(serverURL) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 367358f0..05b9459e 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -185,7 +185,15 @@ async def report_summary_handler( ) QualifireService.report_summary( - request, + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=job.created_at, + judge_model=job.judge_model, + qualifire_api_key=request.qualifire_api_key, + qualifire_url=request.qualifire_url, + ), evaluation_result=EvaluationResults(results=results), ) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index d7a929d6..dc91f321 100644 --- a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -504,10 +504,11 @@ class ReportSummaryRequest(BaseModel): job_id: str structured_summary: Optional[StructuredSummary] = None deep_test: bool = False - start_time: Optional[datetime] = None judge_model: Optional[str] = None + start_time: Optional[datetime] = None qualifire_api_key: Optional[str] = None - qualifire_url: Optional[str] = "https://app.qualifire.ai" + # qualifire_url: Optional[str] = "https://app.qualifire.ai" + qualifire_url: Optional[str] = "http://localhost:3000" class ReportSummaryResponse(BaseModel): From d9ce9e243cd16a9dec3f9d2b2d2a2ac5a8d3cfad Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 14:04:56 +0300 Subject: [PATCH 14/22] tui report save --- sdks/python/rogue_sdk/types.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py index dc91f321..96fedddc 100644 --- 
a/sdks/python/rogue_sdk/types.py +++ b/sdks/python/rogue_sdk/types.py @@ -507,8 +507,7 @@ class ReportSummaryRequest(BaseModel): judge_model: Optional[str] = None start_time: Optional[datetime] = None qualifire_api_key: Optional[str] = None - # qualifire_url: Optional[str] = "https://app.qualifire.ai" - qualifire_url: Optional[str] = "http://localhost:3000" + qualifire_url: Optional[str] = "https://app.qualifire.ai" class ReportSummaryResponse(BaseModel): From c5134b3d292e5a4e77547b81241f26529400267b Mon Sep 17 00:00:00 2001 From: Dror Ivry Date: Wed, 10 Sep 2025 15:19:10 +0300 Subject: [PATCH 15/22] tui report save --- packages/tui/internal/tui/app.go | 15 +++++++++++---- packages/tui/internal/tui/evaluation.go | 2 -- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 0fe05b7b..1c75b0e2 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -86,16 +86,18 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { // Create a context with longer timeout for summary generation ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() - + parsedAPIKey := &m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = nil + } structuredSummary, err := sdk.GenerateSummary( ctx, m.evalState.JobID, judgeModel, apiKey, - &m.config.QualifireAPIKey, + parsedAPIKey, m.evalState.DeepTest, judgeModel, - m.config.ServerURL, ) if err != nil { @@ -565,6 +567,11 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // immediately report the summary if m.evalState != nil && m.evalState.Completed { + parsedAPIKey := m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = "" + } + sdk := NewRogueSDK(m.config.ServerURL) err := sdk.ReportSummary( context.Background(), @@ -572,7 +579,7 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.evalState.StructuredSummary, m.evalState.DeepTest, m.evalState.JudgeModel, - m.config.QualifireAPIKey, + parsedAPIKey, ) if err != nil { // Show error dialog diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index 920cc11d..a3ab4410 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -469,7 +469,6 @@ func (sdk *RogueSDK) GenerateSummary( qualifireAPIKey *string, deepTest bool, judgeModel string, - qualifireURL string, ) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) @@ -490,7 +489,6 @@ func (sdk *RogueSDK) GenerateSummary( }, "job_id": jobID, "qualifire_api_key": *qualifireAPIKey, - "qualifire_url": qualifireURL, "deep_test": deepTest, "judge_model": judgeModel, } From 38ccedd41637e58119b8bcb2efc4340731d4e2b9 Mon Sep 17 00:00:00 2001 From: drorIvry Date: Thu, 11 Sep 2025 13:49:33 +0300 Subject: [PATCH 16/22] Update rogue/run_cli.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- rogue/run_cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index 82eb60a6..62492b5b 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -206,11 +206,17 @@ async def create_report( sdk = RogueSDK(sdk_config) try: + try: + qualifire_api_key = ( + qualifire_api_key_secret.get_secret_value() + if qualifire_api_key_secret + else None + ) summary, _ = await sdk.generate_summary( results=results, model=judge_llm, 
             api_key=judge_llm_api_key,
-            qualifire_api_key=qualifire_api_key_secret,
+            qualifire_api_key=qualifire_api_key,
             deep_test=deep_test_mode,
             judge_model=judge_model,
         )

From 564f4a913ea4887c651504a598e5d301c8e8f9c4 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 13:53:23 +0300
Subject: [PATCH 17/22] ci

---
 rogue/run_cli.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/rogue/run_cli.py b/rogue/run_cli.py
index 82eb60a6..53414cf0 100644
--- a/rogue/run_cli.py
+++ b/rogue/run_cli.py
@@ -214,6 +214,9 @@ async def create_report(
             deep_test=deep_test_mode,
             judge_model=judge_model,
         )
+        except Exception as e:
+            logger.exception("Failed to generate summary")
+            raise e
     finally:
         await sdk.close()

From 4d18d18a414f2dbeedf9d73c063f67e8db95ffd8 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 13:54:36 +0300
Subject: [PATCH 18/22] ci

---
 rogue/run_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rogue/run_cli.py b/rogue/run_cli.py
index 9cf71e1d..614af6b4 100644
--- a/rogue/run_cli.py
+++ b/rogue/run_cli.py
@@ -205,7 +205,6 @@ async def create_report(
     )
     sdk = RogueSDK(sdk_config)
-    try:
         try:
             qualifire_api_key = (
                 qualifire_api_key_secret.get_secret_value()
                 if qualifire_api_key_secret
                 else None
             )

From de3af0b2f58538717ddd4993600096b834527db1 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 14:22:21 +0300
Subject: [PATCH 19/22] ci

---
 rogue/server/api/llm.py                    | 4 ++--
 rogue/server/services/qualifire_service.py | 6 +++---
 sdks/python/rogue_sdk/types.py             | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py
index 05b9459e..28b271a0 100644
--- a/rogue/server/api/llm.py
+++ b/rogue/server/api/llm.py
@@ -143,7 +143,7 @@ async def generate_summary(
             qualifire_url=request.qualifire_url,
             qualifire_api_key=request.qualifire_api_key,
         ),
-        evaluation_result=request.results,
+        evaluation_results=request.results,
     )

     return ServerSummaryGenerationResponse(
@@ -194,7 +194,7 @@ async def report_summary_handler(
             qualifire_api_key=request.qualifire_api_key,
             qualifire_url=request.qualifire_url,
         ),
-        evaluation_result=EvaluationResults(results=results),
+        evaluation_results=EvaluationResults(results=results),
     )

     return ReportSummaryResponse(
diff --git a/rogue/server/services/qualifire_service.py b/rogue/server/services/qualifire_service.py
index 05aabf60..bfdf1356 100644
--- a/rogue/server/services/qualifire_service.py
+++ b/rogue/server/services/qualifire_service.py
@@ -2,21 +2,21 @@
 from loguru import logger

 from .api_format_service import convert_with_structured_summary
-from rogue_sdk.types import EvaluationResult, ReportSummaryRequest
+from rogue_sdk.types import EvaluationResults, ReportSummaryRequest


 class QualifireService:
     @staticmethod
     def report_summary(
         request: ReportSummaryRequest,
-        evaluation_result: EvaluationResult,
+        evaluation_results: EvaluationResults,
     ):
         logger.info(
             "Reporting summary to Qualifire",
         )

         api_evaluation_result = convert_with_structured_summary(
-            evaluation_results=evaluation_result,
+            evaluation_results=evaluation_results,
             structured_summary=request.structured_summary,
             deep_test=request.deep_test,
             start_time=request.start_time,
diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py
index 96fedddc..70cd15e8 100644
--- a/sdks/python/rogue_sdk/types.py
+++ b/sdks/python/rogue_sdk/types.py
@@ -434,7 +434,7 @@ class SummaryGenerationRequest(BaseModel):
     results: EvaluationResults
     model: str = "openai/gpt-4.1"
     api_key: Optional[str] = None
-    job_id: str = ""
+    job_id: Optional[str] = None
     deep_test: bool = False
     judge_model: Optional[str] = None
     qualifire_api_key: Optional[str] = None

From 1e5c5a05b374785fa462c724a500ac846cb50000 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:45:36 +0300
Subject: [PATCH 20/22] ci

---
 packages/tui/internal/tui/eval_ui.go | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go
index 8d2e9d92..76a553be 100644
--- a/packages/tui/internal/tui/eval_ui.go
+++ b/packages/tui/internal/tui/eval_ui.go
@@ -24,10 +24,10 @@ type EvaluationViewState struct {
 	cancelFn func() error

 	// Report generation
-	Summary          string // Generated markdown summary
-	JobID            string // For tracking the evaluation job
-	Completed        bool   // Whether evaluation finished successfully
-	SummaryGenerated bool   // Whether summary generation was already attempted
+	Summary           string // Generated markdown summary
+	JobID             string // For tracking the evaluation job
+	Completed         bool   // Whether evaluation finished successfully
+	SummaryGenerated  bool   // Whether summary generation was already attempted
 	StructuredSummary StructuredSummary

 	// Editing state for New Evaluation

From 53755b7ce6870dee86e23e786c625853e7508498 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:50:38 +0300
Subject: [PATCH 21/22] ci

---
 packages/tui/go.mod | 1 -
 1 file changed, 1 deletion(-)

diff --git a/packages/tui/go.mod b/packages/tui/go.mod
index 413f9fe3..a06d4950 100644
--- a/packages/tui/go.mod
+++ b/packages/tui/go.mod
@@ -14,7 +14,6 @@ require (
 )

 require (
-	github.com/charmbracelet/x/exp/golden v0.0.0-20250207160936-21c02780d27a // indirect
 	github.com/charmbracelet/x/input v0.3.7 // indirect
 	github.com/charmbracelet/x/windows v0.2.1 // indirect
 	github.com/dlclark/regexp2 v1.11.5 // indirect

From e679d82c3cedf063900fc9827d42cc920d1148b3 Mon Sep 17 00:00:00 2001
From: Dror Ivry
Date: Thu, 11 Sep 2025 15:57:33 +0300
Subject: [PATCH 22/22] ci

---
 .github/workflows/rogue.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rogue.yml b/.github/workflows/rogue.yml
index d70c5806..dc20301e 100644
--- a/.github/workflows/rogue.yml
+++ b/.github/workflows/rogue.yml
@@ -56,5 +56,5 @@ jobs:
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         with:
          evaluated_agent_url: "http://localhost:10001"
-          judge_llm: "openai/gpt-4.1-mini"
+          judge_llm: "openai/gpt-4.1"
           workdir: "./examples/tshirt_store_agent/.rogue"
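
Reviewer note on PATCH 15: summaryGenerationCmd now passes parsedAPIKey == nil into GenerateSummary whenever QualifireEnabled is false, but GenerateSummary still builds the request body with "qualifire_api_key": *qualifireAPIKey, so a disabled-Qualifire config would hit a nil-pointer dereference. A minimal guard is sketched below; it is an illustration only, not part of the series, and the helper name is hypothetical. It assumes (unverified) that the server treats an empty key as "Qualifire disabled":

	// qualifireKeyOrEmpty guards the optional Qualifire key before it is
	// dereferenced into GenerateSummary's request body; nil maps to "".
	func qualifireKeyOrEmpty(key *string) string {
		if key == nil {
			return ""
		}
		return *key
	}

The request map in evaluation.go would then read "qualifire_api_key": qualifireKeyOrEmpty(qualifireAPIKey) instead of dereferencing unconditionally.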
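
For the same gating, the post-completion reporting path that PATCH 15 lands in Update can be read as one small helper. The sketch below only restates the call pattern the diffs above establish; the Model, evalState, and config field names are taken from app.go and eval_ui.go, while the wrapper name is hypothetical. As a style point, it also uses the idiomatic !m.config.QualifireEnabled in place of the patch's m.config.QualifireEnabled == false:

	// reportSummaryIfCompleted reports the structured summary once the
	// evaluation has finished, blanking the key when Qualifire is disabled.
	func reportSummaryIfCompleted(ctx context.Context, m *Model) error {
		if m.evalState == nil || !m.evalState.Completed {
			return nil // nothing to report yet
		}
		apiKey := m.config.QualifireAPIKey
		if !m.config.QualifireEnabled {
			apiKey = ""
		}
		sdk := NewRogueSDK(m.config.ServerURL)
		return sdk.ReportSummary(
			ctx,
			m.evalState.JobID,
			m.evalState.StructuredSummary,
			m.evalState.DeepTest,
			m.evalState.JudgeModel,
			apiKey,
		)
	}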