diff --git a/.github/workflows/rogue.yml b/.github/workflows/rogue.yml index d70c5806..dc20301e 100644 --- a/.github/workflows/rogue.yml +++ b/.github/workflows/rogue.yml @@ -56,5 +56,5 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} with: evaluated_agent_url: "http://localhost:10001" - judge_llm: "openai/gpt-4.1-mini" + judge_llm: "openai/gpt-4.1" workdir: "./examples/tshirt_store_agent/.rogue" diff --git a/examples/js/cli/package.json b/examples/js/cli/package.json index 0afd3083..758f40a0 100644 --- a/examples/js/cli/package.json +++ b/examples/js/cli/package.json @@ -20,4 +20,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/langgraph-js-example/package.json b/examples/js/langgraph-js-example/package.json index 46860e2a..86b52c0b 100644 --- a/examples/js/langgraph-js-example/package.json +++ b/examples/js/langgraph-js-example/package.json @@ -28,4 +28,4 @@ "ts-node": "^10.9.2", "typescript": "^5.8.3" } -} \ No newline at end of file +} diff --git a/examples/js/vercel-ai-example/package.json b/examples/js/vercel-ai-example/package.json index a5c1cee5..4c39d140 100644 --- a/examples/js/vercel-ai-example/package.json +++ b/examples/js/vercel-ai-example/package.json @@ -25,4 +25,4 @@ "uuid": "^11.1.0", "zod": "^3.24.1" } -} \ No newline at end of file +} diff --git a/examples/tshirt_store_agent/tshirt_store_agent_executor.py b/examples/tshirt_store_agent/tshirt_store_agent_executor.py index 68286326..32cb739a 100644 --- a/examples/tshirt_store_agent/tshirt_store_agent_executor.py +++ b/examples/tshirt_store_agent/tshirt_store_agent_executor.py @@ -129,7 +129,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -151,15 +151,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -185,8 +186,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -196,7 +197,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/packages/tui/go.mod b/packages/tui/go.mod index 2e986de1..a06d4950 100644 --- a/packages/tui/go.mod +++ b/packages/tui/go.mod @@ -14,7 +14,6 @@ require ( ) require ( - github.com/charmbracelet/x/exp/golden v0.0.0-20250207160936-21c02780d27a // indirect github.com/charmbracelet/x/input v0.3.7 // indirect github.com/charmbracelet/x/windows v0.2.1 // indirect github.com/dlclark/regexp2 v1.11.5 // indirect @@ -23,6 +22,7 @@ require ( require ( github.com/alecthomas/chroma/v2 v2.20.0 + github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 
github.com/charmbracelet/colorprofile v0.3.1 // indirect github.com/charmbracelet/x/cellbuf v0.0.14-0.20250505150409-97991a1f17d1 // indirect github.com/charmbracelet/x/term v0.2.1 // indirect diff --git a/packages/tui/go.sum b/packages/tui/go.sum index a69ee239..d4e9da06 100644 --- a/packages/tui/go.sum +++ b/packages/tui/go.sum @@ -6,6 +6,8 @@ github.com/alecthomas/repr v0.5.1 h1:E3G4t2QbHTSNpPKBgMTln5KLkZHLOcU7r37J4pXBuIg github.com/alecthomas/repr v0.5.1/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1 h1:swACzss0FjnyPz1enfX56GKkLiuKg5FlyVmOLIlU2kE= +github.com/charmbracelet/bubbles/v2 v2.0.0-beta.1/go.mod h1:6HamsBKWqEC/FVHuQMHgQL+knPyvHH55HwJDHl/adMw= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4 h1:UgUuKKvBwgqm2ZEL+sKv/OLeavrUb4gfHgdxe6oIOno= github.com/charmbracelet/bubbletea/v2 v2.0.0-beta.4/go.mod h1:0wWFRpsgF7vHsCukVZ5LAhZkiR4j875H6KEM2/tFQmA= github.com/charmbracelet/colorprofile v0.3.1 h1:k8dTHMd7fgw4bnFd7jXTLZrSU/CQrKnL3m+AxCzDz40= diff --git a/packages/tui/internal/components/llm_config_dialog.go b/packages/tui/internal/components/llm_config_dialog.go index b07471a3..54fcd104 100644 --- a/packages/tui/internal/components/llm_config_dialog.go +++ b/packages/tui/internal/components/llm_config_dialog.go @@ -549,7 +549,7 @@ func (d LLMConfigDialog) handleEnter() (LLMConfigDialog, tea.Cmd) { // handlePaste handles clipboard paste operation for API key input func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { // Get clipboard content based on the operating system - clipboardText, err := getClipboardContent() + clipboardText, err := GetClipboardContent() if err != nil { // If clipboard reading fails, just return without error return d, nil @@ -569,8 +569,8 @@ func (d LLMConfigDialog) handlePaste() (LLMConfigDialog, tea.Cmd) { return d, nil } -// getClipboardContent reads content from the system clipboard -func getClipboardContent() (string, error) { +// GetClipboardContent reads content from the system clipboard +func GetClipboardContent() (string, error) { var cmd *exec.Cmd switch runtime.GOOS { diff --git a/packages/tui/internal/theme/themes/vesper.json b/packages/tui/internal/theme/themes/vesper.json index b8406f93..08eade58 100644 --- a/packages/tui/internal/theme/themes/vesper.json +++ b/packages/tui/internal/theme/themes/vesper.json @@ -216,4 +216,4 @@ } } } - \ No newline at end of file + diff --git a/packages/tui/internal/tui/app.go b/packages/tui/internal/tui/app.go index 1d9746aa..1c75b0e2 100644 --- a/packages/tui/internal/tui/app.go +++ b/packages/tui/internal/tui/app.go @@ -8,7 +8,9 @@ import ( "strings" "time" + "github.com/charmbracelet/bubbles/v2/table" tea "github.com/charmbracelet/bubbletea/v2" + "github.com/pelletier/go-toml/v2" "github.com/rogue/tui/internal/components" "github.com/rogue/tui/internal/theme" @@ -84,8 +86,52 @@ func (m *Model) summaryGenerationCmd() tea.Cmd { // Create a context with longer timeout for summary generation ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) defer cancel() + parsedAPIKey := &m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = nil + } + structuredSummary, err := sdk.GenerateSummary( + ctx, + m.evalState.JobID, + judgeModel, + apiKey, + parsedAPIKey, + m.evalState.DeepTest, + judgeModel, + ) + + if err != nil { 
+ return SummaryGeneratedMsg{ + Summary: "", + Err: err, + } + } + + m.evalState.StructuredSummary = structuredSummary.Summary + + overallSummary := structuredSummary.Summary.OverallSummary + keyFindings := structuredSummary.Summary.KeyFindings + parsedKeyFindings := "" + for _, finding := range keyFindings { + parsedKeyFindings += "- " + finding + "\n" + } + recommendations := structuredSummary.Summary.Recommendations + parsedRecommendations := "" + for _, recommendation := range recommendations { + parsedRecommendations += "- " + recommendation + "\n" + } + + detailedBreakdown := structuredSummary.Summary.DetailedBreakdown + parsedDetailedBreakdown := "" + for _, breakdown := range detailedBreakdown { + parsedDetailedBreakdown += "- " + breakdown.Scenario + " - " + breakdown.Status + " - " + breakdown.Outcome + "\n" + } + + summary := "## Overall Summary\n\n" + overallSummary + + "\n\n" + "## Key Findings\n\n" + parsedKeyFindings + + "\n\n" + "## Recommendations\n\n" + parsedRecommendations + + "\n\n" + "## Detailed Breakdown\n\n" + parsedDetailedBreakdown - summary, err := sdk.GenerateSummary(ctx, m.evalState.JobID, judgeModel, apiKey) return SummaryGeneratedMsg{ Summary: summary, Err: err, @@ -131,20 +177,21 @@ type App struct { // Model represents the main application state type Model struct { - currentScreen Screen - width int - height int - input string - cursor int - evaluations []Evaluation - scenarios []Scenario - config Config - version string - commandInput components.CommandInput - dialog *components.Dialog - dialogStack []components.Dialog - llmDialog *components.LLMConfigDialog - scenarioEditor components.ScenarioEditor + currentScreen Screen + width int + height int + input string + cursor int + evaluations []Evaluation + scenarios []Scenario + config Config + version string + commandInput components.CommandInput + dialog *components.Dialog + dialogStack []components.Dialog + llmDialog *components.LLMConfigDialog + scenarioEditor components.ScenarioEditor + detailedBreakdown []table.Row // Spinners for loading states healthSpinner components.Spinner @@ -238,7 +285,7 @@ func (a *App) Run() error { // Initialize viewports eventsViewport: components.NewViewport(1, 80, 20), summaryViewport: components.NewViewport(2, 80, 20), - reportViewport: components.NewViewport(3, 80, 20), + reportViewport: components.NewViewport(3, 80, 15), focusedViewport: 0, // Start with events viewport focused eventsAutoScroll: true, // Start with auto-scroll enabled } @@ -285,6 +332,25 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { } return m, tea.Batch(cmds...) 
} + + if m.dialog != nil { + clipboardText, err := components.GetClipboardContent() + if err != nil { + // If clipboard reading fails, just return without error + return m, nil + } + + // Clean the clipboard text (remove newlines and trim whitespace) + cleanText := strings.TrimSpace(strings.ReplaceAll(clipboardText, "\n", "")) + + if cleanText == "" { + return m, nil + } + + m.dialog.Input += cleanText + m.dialog.InputCursor = len(m.dialog.Input) + return m, nil + } case components.SpinnerTickMsg: // Update spinners m.healthSpinner, cmd = m.healthSpinner.Update(msg) @@ -485,6 +551,71 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { // Handle dialog closure if m.dialog != nil { switch msg.Action { + case "save_qualifire_and_report": + // Handle Qualifire API key save and report persistence + if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { + // Save the API key to config (allow empty to clear the key) + m.config.QualifireAPIKey = msg.Input + // Only enable integration if there's an API key + if msg.Input != "" { + m.config.QualifireEnabled = true + if m.configState != nil { + m.configState.QualifireEnabled = true + m.configState.HasChanges = true + } + } + + // immediately report the summary + if m.evalState != nil && m.evalState.Completed { + parsedAPIKey := m.config.QualifireAPIKey + if m.config.QualifireEnabled == false { + parsedAPIKey = "" + } + + sdk := NewRogueSDK(m.config.ServerURL) + err := sdk.ReportSummary( + context.Background(), + m.evalState.JobID, + m.evalState.StructuredSummary, + m.evalState.DeepTest, + m.evalState.JudgeModel, + parsedAPIKey, + ) + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Report Summary Error", + fmt.Sprintf("Failed to report summary: %v", err), + ) + m.dialog = &errorDialog + } + + err = m.saveConfig() + if err != nil { + // Show error dialog + errorDialog := components.ShowErrorDialog( + "Configuration Error", + fmt.Sprintf("Failed to save Qualifire configuration: %v", err), + ) + m.dialog = &errorDialog + return m, nil + } else { + // Show appropriate success dialog + var message string + if msg.Input != "" { + message = "Qualifire API key has been successfully saved and integration is now enabled. Your evaluation report will now be automatically persisted." + } else { + message = "Qualifire API key has been cleared and integration is now disabled." 
+ } + successDialog := components.NewInfoDialog( + "Qualifire Configured", + message, + ) + m.dialog = &successDialog + return m, nil + } + } + } case "save_qualifire": // Handle Qualifire API key save if m.dialog != nil && m.dialog.Title == "Configure Qualifire API Key" { @@ -543,7 +674,7 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { ) // Customize the buttons for this specific use case dialog.Buttons = []components.DialogButton{ - {Label: "Save", Action: "save_qualifire", Style: components.PrimaryButton}, + {Label: "Save", Action: "save_qualifire_and_report", Style: components.PrimaryButton}, } // Position cursor at end of existing key if there is one dialog.InputCursor = len(m.config.QualifireAPIKey) diff --git a/packages/tui/internal/tui/eval_ui.go b/packages/tui/internal/tui/eval_ui.go index 2c8eba6d..76a553be 100644 --- a/packages/tui/internal/tui/eval_ui.go +++ b/packages/tui/internal/tui/eval_ui.go @@ -14,7 +14,7 @@ type EvaluationViewState struct { JudgeModel string ParallelRuns int DeepTest bool - Scenarios []string + Scenarios []EvalScenario // Runtime Running bool @@ -24,10 +24,11 @@ type EvaluationViewState struct { cancelFn func() error // Report generation - Summary string // Generated markdown summary - JobID string // For tracking the evaluation job - Completed bool // Whether evaluation finished successfully - SummaryGenerated bool // Whether summary generation was already attempted + Summary string // Generated markdown summary + JobID string // For tracking the evaluation job + Completed bool // Whether evaluation finished successfully + SummaryGenerated bool // Whether summary generation was already attempted + StructuredSummary StructuredSummary // Editing state for New Evaluation currentField int // 0: AgentURL, 1: JudgeModel, 2: DeepTest, 3: StartButton @@ -35,7 +36,7 @@ type EvaluationViewState struct { } // loadScenariosFromWorkdir reads .rogue/scenarios.json upward from CWD -func loadScenariosFromWorkdir() []string { +func loadScenariosFromWorkdir() []EvalScenario { wd, _ := os.Getwd() dir := wd for { @@ -43,14 +44,20 @@ func loadScenariosFromWorkdir() []string { if b, err := os.ReadFile(p); err == nil { var v struct { Scenarios []struct { - Scenario string `json:"scenario"` + Scenario string `json:"scenario"` + ScenarioType string `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome"` } `json:"scenarios"` } if json.Unmarshal(b, &v) == nil { - out := make([]string, 0, len(v.Scenarios)) + out := make([]EvalScenario, 0, len(v.Scenarios)) for _, s := range v.Scenarios { if s.Scenario != "" { - out = append(out, s.Scenario) + out = append(out, EvalScenario{ + Scenario: s.Scenario, + ScenarioType: ScenarioType(s.ScenarioType), + ExpectedOutcome: s.ExpectedOutcome, + }) } } return out diff --git a/packages/tui/internal/tui/evaluation.go b/packages/tui/internal/tui/evaluation.go index ecbd6a2c..a3ab4410 100644 --- a/packages/tui/internal/tui/evaluation.go +++ b/packages/tui/internal/tui/evaluation.go @@ -42,8 +42,9 @@ type AgentConfig struct { } type EvalScenario struct { - Scenario string `json:"scenario"` - ScenarioType ScenarioType `json:"scenario_type"` + Scenario string `json:"scenario"` + ScenarioType ScenarioType `json:"scenario_type"` + ExpectedOutcome string `json:"expected_outcome,omitempty"` } type EvaluationRequest struct { @@ -92,6 +93,21 @@ type RogueSDK struct { ws *websocket.Conn } +type StructuredSummary struct { + OverallSummary string `json:"overall_summary"` + KeyFindings []string `json:"key_findings"` + 
Recommendations []string `json:"recommendations"` + DetailedBreakdown []struct { + Scenario string `json:"scenario"` + Status string `json:"status"` + Outcome string `json:"outcome"` + } `json:"detailed_breakdown"` +} +type SummaryResp struct { + Summary StructuredSummary `json:"summary"` + Message string `json:"message"` +} + // NewRogueSDK creates a new SDK instance func NewRogueSDK(baseURL string) *RogueSDK { return &RogueSDK{ @@ -401,7 +417,15 @@ func (sdk *RogueSDK) CancelEvaluation(ctx context.Context, jobID string) error { } // StartEvaluation is the main entry point used by the TUI -func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, scenarios []string, judgeModel string, parallelRuns int, deepTest bool) (<-chan EvaluationEvent, func() error, error) { +func (m *Model) StartEvaluation( + ctx context.Context, + serverURL string, + agentURL string, + scenarios []EvalScenario, + judgeModel string, + parallelRuns int, + deepTest bool, +) (<-chan EvaluationEvent, func() error, error) { sdk := NewRogueSDK(serverURL) // Validate URLs @@ -429,8 +453,9 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, // Convert scenarios for _, s := range scenarios { request.Scenarios = append(request.Scenarios, EvalScenario{ - Scenario: s, - ScenarioType: ScenarioTypePolicy, + Scenario: s.Scenario, + ScenarioType: s.ScenarioType, + ExpectedOutcome: s.ExpectedOutcome, }) } @@ -438,15 +463,21 @@ func (m *Model) StartEvaluation(ctx context.Context, serverURL, agentURL string, } // GenerateSummary generates a markdown summary from evaluation results -func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey string) (string, error) { +func (sdk *RogueSDK) GenerateSummary( + ctx context.Context, + jobID, model, apiKey string, + qualifireAPIKey *string, + deepTest bool, + judgeModel string, +) (*SummaryResp, error) { // First get the evaluation job to extract results job, err := sdk.GetEvaluation(ctx, jobID) if err != nil { - return "", fmt.Errorf("failed to get evaluation results: %w", err) + return nil, fmt.Errorf("failed to get evaluation results: %w", err) } if job.Results == nil { - return "", fmt.Errorf("no results available for job %s", jobID) + return nil, fmt.Errorf("no results available for job %s", jobID) } // Prepare summary request - match server's SummaryGenerationRequest format @@ -456,16 +487,20 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s "results": map[string]interface{}{ "results": job.Results, }, + "job_id": jobID, + "qualifire_api_key": *qualifireAPIKey, + "deep_test": deepTest, + "judge_model": judgeModel, } body, err := json.Marshal(summaryReq) if err != nil { - return "", err + return nil, err } req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/summary", bytes.NewReader(body)) if err != nil { - return "", err + return nil, err } req.Header.Set("Content-Type", "application/json") @@ -476,24 +511,64 @@ func (sdk *RogueSDK) GenerateSummary(ctx context.Context, jobID, model, apiKey s resp, err := longTimeoutClient.Do(req) if err != nil { - return "", err + return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { body, _ := io.ReadAll(resp.Body) - return "", fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) + return nil, fmt.Errorf("summary generation failed: %d %s", resp.StatusCode, string(body)) } - var summaryResp struct { - Summary string `json:"summary"` - Message string `json:"message"` - } 
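
For reference, here is a minimal Python sketch of the same call the Go `GenerateSummary` helper above makes against the updated `/api/v1/llm/summary` route. The payload keys mirror the request map built in this diff; the server URL, job ID, model names, and API keys are placeholders, and the use of `httpx` is an assumption for illustration rather than part of the change.

```python
# Minimal sketch of the updated summary request, assuming a local Rogue server.
# All concrete values (URL, job ID, API keys, models) are placeholders.
import httpx

payload = {
    "model": "openai/gpt-4o-mini",
    "api_key": "<judge-llm-api-key>",
    "results": {"results": []},  # evaluation results as returned by the job (placeholder)
    "job_id": "<job-id>",
    "qualifire_api_key": "<qualifire-api-key>",  # optional; triggers reporting server-side
    "deep_test": False,
    "judge_model": "openai/gpt-4o-mini",
}

resp = httpx.post(
    "http://localhost:8000/api/v1/llm/summary",
    json=payload,
    timeout=300.0,
)
resp.raise_for_status()

# The server now returns a structured summary object rather than a markdown string.
summary = resp.json()["summary"]
print(summary["overall_summary"])
for finding in summary["key_findings"]:
    print("-", finding)
```

Because the response now carries the structured summary instead of pre-rendered markdown, both the Go TUI and the Python SDK in this diff render the markdown view client-side.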
+ var summaryResp SummaryResp + if err := json.NewDecoder(resp.Body).Decode(&summaryResp); err != nil { - return "", err + return nil, err } - return summaryResp.Summary, nil + return &summaryResp, nil +} + +// ReportSummary reports a summary to Qualifire +func (sdk *RogueSDK) ReportSummary( + ctx context.Context, + jobID string, + summary StructuredSummary, + deepTest bool, + judgeModel string, + qualifireAPIKey string, +) error { + reportReq := map[string]interface{}{ + "job_id": jobID, + "structured_summary": summary, + "deep_test": deepTest, + "judge_model": judgeModel, + "qualifire_api_key": qualifireAPIKey, + } + + body, err := json.Marshal(reportReq) + if err != nil { + return err + } + + req, err := http.NewRequestWithContext(ctx, "POST", sdk.baseURL+"/api/v1/llm/report_summary", bytes.NewReader(body)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + + resp, err := sdk.httpClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("report summary failed: %d %s", resp.StatusCode, string(body)) + } + + return nil } // CheckServerHealth calls GET /health and returns the status string diff --git a/packages/tui/internal/tui/report_view.go b/packages/tui/internal/tui/report_view.go index 08c71e45..84bf31ec 100644 --- a/packages/tui/internal/tui/report_view.go +++ b/packages/tui/internal/tui/report_view.go @@ -22,7 +22,7 @@ func (m Model) renderReport() string { // Main container style with full width and height background mainStyle := lipgloss.NewStyle(). Width(m.width). - Height(m.height - 1). // -1 for footer + Height(m.height - 12). Background(t.Background()) // Title style @@ -58,27 +58,28 @@ func (m Model) renderReport() string { } // Calculate viewport dimensions - viewportWidth := m.width - 4 // Leave margins + viewportWidth := m.width - 8 // Leave margins viewportHeight := m.height - 8 // title(3) + help(1) + margins(4) // Create a temporary copy of the viewport to avoid modifying the original viewport := m.reportViewport - viewport.SetSize(viewportWidth, viewportHeight) + viewport.SetSize(viewportWidth, viewportHeight-2) viewport.SetContent(reportContent) // Style the viewport with border viewportStyle := lipgloss.NewStyle(). + Height(viewportHeight - 8). Border(lipgloss.RoundedBorder()). BorderForeground(t.Border()). BorderBackground(t.BackgroundPanel()). - Background(t.BackgroundPanel()). - Width(viewportWidth + 2). // +2 for border - Height(viewportHeight + 2) // +2 for border + Background(t.BackgroundPanel()) // Apply viewport styling viewport.Style = lipgloss.NewStyle(). Foreground(t.Text()). Background(t.BackgroundPanel()). + Width(viewportWidth). + Height(viewportHeight-8). Padding(1, 2) // Help text style @@ -102,13 +103,13 @@ func (m Model) renderReport() string { // Center the viewport in the available space contentArea := lipgloss.NewStyle(). Width(m.width). - Height(viewportHeight + 2). + Height(viewportHeight - 8). 
Background(t.Background()) centeredViewport := contentArea.Render( lipgloss.Place( m.width, - viewportHeight+2, + viewportHeight-8, lipgloss.Center, lipgloss.Top, viewportContent, diff --git a/rogue/common/generic_agent_executor.py b/rogue/common/generic_agent_executor.py index 0e1de2f6..059dc77a 100644 --- a/rogue/common/generic_agent_executor.py +++ b/rogue/common/generic_agent_executor.py @@ -128,7 +128,7 @@ async def _upsert_session(self, session_id: str): if session is None: logger.error( f"Critical error: Session is None even after " - f"create_session for session_id: {session_id}" + f"create_session for session_id: {session_id}", ) raise RuntimeError( f"Failed to get or create session: {session_id}", @@ -150,15 +150,16 @@ def convert_a2a_part_to_genai(part: Part) -> types.Part: if isinstance(part.file, FileWithUri): return types.Part( file_data=types.FileData( - file_uri=part.file.uri, mime_type=part.file.mimeType - ) + file_uri=part.file.uri, + mime_type=part.file.mimeType, + ), ) if isinstance(part.file, FileWithBytes): return types.Part( inline_data=types.Blob( data=base64.b64decode(part.file.bytes), mime_type=part.file.mimeType, - ) + ), ) raise ValueError(f"Unsupported file type: {type(part.file)}") raise ValueError(f"Unsupported part type: {type(part)}") @@ -184,8 +185,8 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: file=FileWithUri( uri=part.file_data.file_uri or "", mimeType=part.file_data.mime_type, - ) - ) + ), + ), ) if part.inline_data: return Part( @@ -195,7 +196,7 @@ def convert_genai_part_to_a2a(part: types.Part) -> Part: part.inline_data.data, # type: ignore ).decode(), mimeType=part.inline_data.mime_type, - ) - ) + ), + ), ) raise ValueError(f"Unsupported part type: {part}") diff --git a/rogue/common/remote_agent_connection.py b/rogue/common/remote_agent_connection.py index e08caf83..4fdf080b 100644 --- a/rogue/common/remote_agent_connection.py +++ b/rogue/common/remote_agent_connection.py @@ -81,7 +81,7 @@ async def send_message( SendStreamingMessageRequest( id=uuid4().hex, params=request, - ) + ), ): logger.debug( "received stream response from remote agent", @@ -110,7 +110,7 @@ async def send_message( SendMessageRequest( id=uuid4().hex, params=request, - ) + ), ) logger.debug( diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py index 79e4f608..001ae760 100644 --- a/rogue/evaluator_agent/evaluator_agent.py +++ b/rogue/evaluator_agent/evaluator_agent.py @@ -122,6 +122,7 @@ - `scenario`: The entire scenario json object being tested. The json-object contains: - "scenario": The scenario text. - "scenario_type": The scenario type. + - "expected_outcome": The expected outcome of the scenario. - `context_id`: The conversation's context ID - `evaluation_passed`: Boolean indicating whether the agent complied with the policy. You should determine this based on the conversation. - `reason`: A brief explanation of your decision @@ -363,6 +364,7 @@ def _log_evaluation( context_id: str, evaluation_passed: bool, reason: str, + scenario_type: Optional[str], ) -> None: """ Logs the evaluation of the given scenario and test case. @@ -370,6 +372,7 @@ def _log_evaluation( This is the scenario dictionary containing both the scenario text and type: - scenario: The scenario text. - scenario_type: The scenario type. + - expected_outcome: The expected outcome of the scenario. :param context_id: The conversation's context_id. This allows us to distinguish which conversation is being evaluated. 
:param evaluation_passed: A boolean value with the evaluation result. This is @@ -391,6 +394,11 @@ def _log_evaluation( ), "evaluation_passed (from agent)": evaluation_passed, "reason (from agent)": reason, + "scenario_type": scenario_type, + "expected_outcome": scenario.get( + "expected_outcome", + "None", + ), }, ) diff --git a/rogue/run_cli.py b/rogue/run_cli.py index acf8979e..614af6b4 100644 --- a/rogue/run_cli.py +++ b/rogue/run_cli.py @@ -188,6 +188,9 @@ async def create_report( results: EvaluationResults, output_report_file: Path, judge_llm_api_key_secret: SecretStr | None = None, + qualifire_api_key_secret: SecretStr | None = None, + deep_test_mode: bool = False, + judge_model: str | None = None, ) -> str: judge_llm_api_key = ( judge_llm_api_key_secret.get_secret_value() @@ -203,11 +206,22 @@ async def create_report( sdk = RogueSDK(sdk_config) try: - summary = await sdk.generate_summary( + qualifire_api_key = ( + qualifire_api_key_secret.get_secret_value() + if qualifire_api_key_secret + else None + ) + summary, _ = await sdk.generate_summary( results=results, model=judge_llm, api_key=judge_llm_api_key, + qualifire_api_key=qualifire_api_key, + deep_test=deep_test_mode, + judge_model=judge_model, ) + except Exception as e: + logger.exception("Failed to generate summary") + raise e finally: await sdk.close() @@ -352,6 +366,8 @@ async def run_cli(args: Namespace) -> int: results=results, output_report_file=cli_input.output_report_file, judge_llm_api_key_secret=cli_input.judge_llm_api_key, + deep_test_mode=cli_input.deep_test_mode, + judge_model=cli_input.judge_llm, ) logger.info("Report saved", extra={"report_file": cli_input.output_report_file}) diff --git a/rogue/server/api/__init__.py b/rogue/server/api/__init__.py index e1dec064..0b1b8c2e 100644 --- a/rogue/server/api/__init__.py +++ b/rogue/server/api/__init__.py @@ -2,6 +2,11 @@ API endpoints for the Rogue Agent Evaluator Server. """ -from . import evaluation, health, interview, llm +from . import ( + evaluation, + health, + interview, + llm, +) __all__ = ["evaluation", "health", "interview", "llm"] diff --git a/rogue/server/api/evaluation.py b/rogue/server/api/evaluation.py index 00e29c2f..636dbf0c 100644 --- a/rogue/server/api/evaluation.py +++ b/rogue/server/api/evaluation.py @@ -59,6 +59,8 @@ async def create_evaluation( status=EvaluationStatus.PENDING, created_at=datetime.now(timezone.utc), request=request, + deep_test=request.agent_config.deep_test_mode, + judge_model=request.agent_config.judge_llm, ) await evaluation_service.add_job(job) diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py index 4fa434f8..28b271a0 100644 --- a/rogue/server/api/llm.py +++ b/rogue/server/api/llm.py @@ -4,16 +4,25 @@ This module provides REST API endpoints for LLM operations. 
""" -from fastapi import APIRouter, HTTPException +from datetime import datetime, timezone +from fastapi import APIRouter, Depends, HTTPException from rogue_sdk.types import ( + EvaluationResults, ScenarioGenerationRequest, ScenarioGenerationResponse, SummaryGenerationRequest, - SummaryGenerationResponse, + ReportSummaryResponse, + ReportSummaryRequest, ) +from rogue.server.api.evaluation import get_evaluation_service +from rogue.server.services.evaluation_service import EvaluationService + +from ..models.api_format import ServerSummaryGenerationResponse + from ...common.logging import get_logger from ..services.llm_service import LLMService +from ..services.qualifire_service import QualifireService router = APIRouter(prefix="/llm", tags=["llm"]) logger = get_logger(__name__) @@ -57,8 +66,14 @@ async def generate_scenarios(request: ScenarioGenerationRequest): ) -@router.post("/summary", response_model=SummaryGenerationResponse) -async def generate_summary(request: SummaryGenerationRequest): +@router.post( + "/summary", + response_model=ServerSummaryGenerationResponse, +) +async def generate_summary( + request: SummaryGenerationRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), +) -> ServerSummaryGenerationResponse: """ Generate evaluation summary from results. @@ -81,7 +96,57 @@ async def generate_summary(request: SummaryGenerationRequest): logger.info("Successfully generated evaluation summary") - return SummaryGenerationResponse( + logger.info( + "Qualifire API key", + extra={"qualifire_api_key": request.qualifire_api_key}, + ) + logger.info( + "Job ID", + extra={"job_id": request.job_id}, + ) + logger.info( + "Qualifire URL", + extra={"qualifire_url": request.qualifire_url}, + ) + + if request.qualifire_api_key and request.job_id: + + logger.info( + "Reporting summary to Qualifire", + extra={"job_id": request.job_id}, + ) + + job = await evaluation_service.get_job(request.job_id) + + if not job and not request.judge_model and not request.deep_test: + raise HTTPException( + status_code=400, + detail="Job not found and judge model and deep test are not provided", # noqa: E501 + ) + + logger.info( + "Summary", + extra={"summary": summary, "results": request.results}, + ) + + QualifireService.report_summary( + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=summary, + deep_test=job.deep_test if job else request.deep_test, + start_time=( + job.created_at + if job is not None + else datetime.now(timezone.utc) + ), + judge_model=job.judge_model if job else request.judge_model, + qualifire_url=request.qualifire_url, + qualifire_api_key=request.qualifire_api_key, + ), + evaluation_results=request.results, + ) + + return ServerSummaryGenerationResponse( summary=summary, message="Successfully generated evaluation summary", ) @@ -92,3 +157,52 @@ async def generate_summary(request: SummaryGenerationRequest): status_code=500, detail=f"Failed to generate summary: {str(e)}", ) + + +@router.post("/report_summary", response_model=ReportSummaryResponse) +async def report_summary_handler( + request: ReportSummaryRequest, + evaluation_service: EvaluationService = Depends(get_evaluation_service), +): + """ + Report summary to Qualifire. 
+ """ + try: + job = await evaluation_service.get_job(request.job_id) + + if not job: + raise HTTPException( + status_code=404, + detail="Evaluation job not found", + ) + + results = job.results + + if not results or len(results) == 0: + raise HTTPException( + status_code=404, + detail="Evaluation results not found or empty", + ) + + QualifireService.report_summary( + ReportSummaryRequest( + job_id=request.job_id, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=job.created_at, + judge_model=job.judge_model, + qualifire_api_key=request.qualifire_api_key, + qualifire_url=request.qualifire_url, + ), + evaluation_results=EvaluationResults(results=results), + ) + + return ReportSummaryResponse( + success=True, + ) + except Exception as e: + logger.exception("Failed to report summary") + raise HTTPException( + status_code=e.status_code if hasattr(e, "status_code") else 500, + detail=f"Failed to report summary: {str(e)}", + ) diff --git a/rogue/server/models/__init__.py b/rogue/server/models/__init__.py new file mode 100644 index 00000000..9a5b64b5 --- /dev/null +++ b/rogue/server/models/__init__.py @@ -0,0 +1,15 @@ +"""Server models for the Rogue Agent Evaluator.""" + +from .api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, +) + +__all__ = [ + "ApiChatMessage", + "ApiConversationEvaluation", + "ApiEvaluationResult", + "ApiScenarioResult", +] diff --git a/rogue/server/models/api_format.py b/rogue/server/models/api_format.py new file mode 100644 index 00000000..f3895b8e --- /dev/null +++ b/rogue/server/models/api_format.py @@ -0,0 +1,56 @@ +"""API format models for evaluation results. + +These models define the enhanced API format for evaluation results +that includes summary, key findings, recommendations, and metadata. +""" + +from datetime import datetime +from typing import List, Optional + +from pydantic import BaseModel +from rogue_sdk.types import StructuredSummary + + +class ApiChatMessage(BaseModel): + """Chat message for new API format with datetime timestamp.""" + + role: str + content: str + timestamp: datetime + + +class ApiConversationEvaluation(BaseModel): + """Conversation evaluation for new API format.""" + + passed: bool + messages: List[ApiChatMessage] + reason: Optional[str] = None + + +class ApiScenarioResult(BaseModel): + """Result of evaluating a single scenario in new API format.""" + + description: Optional[str] = None + expectedOutcome: Optional[str] = None + totalConversations: Optional[int] = None + flaggedConversations: Optional[int] = None + conversations: List[ApiConversationEvaluation] + + +class ApiEvaluationResult(BaseModel): + """New API format for evaluation results.""" + + scenarios: List[ApiScenarioResult] + summary: Optional[str] = None + keyFindings: Optional[str] = None + recommendation: Optional[str] = None + deepTest: bool = False + startTime: datetime + judgeModel: Optional[str] = None + + +class ServerSummaryGenerationResponse(BaseModel): + """Server response for summary generation with structured summary.""" + + summary: StructuredSummary + message: str diff --git a/rogue/server/services/__init__.py b/rogue/server/services/__init__.py index 8e3466ae..6b0b3aaf 100644 --- a/rogue/server/services/__init__.py +++ b/rogue/server/services/__init__.py @@ -1,7 +1,9 @@ from . 
import ( + api_format_service, evaluation_library, evaluation_service, interviewer_service, llm_service, scenario_evaluation_service, + qualifire_service, ) diff --git a/rogue/server/services/api_format_service.py b/rogue/server/services/api_format_service.py new file mode 100644 index 00000000..d63336e2 --- /dev/null +++ b/rogue/server/services/api_format_service.py @@ -0,0 +1,141 @@ +"""Service for converting evaluation results to API format. + +This service handles the conversion from legacy EvaluationResults +to the new enhanced API format with structured summary data. +""" + +from datetime import datetime, timezone +from typing import Optional + +from rogue_sdk.types import EvaluationResults, StructuredSummary + +from ..models.api_format import ( + ApiChatMessage, + ApiConversationEvaluation, + ApiEvaluationResult, + ApiScenarioResult, +) + + +def convert_to_api_format( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert legacy EvaluationResults to new API format. + + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM with separate sections + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started (defaults to current time) + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format evaluation result with additional metadata + """ + if start_time is None: + start_time = datetime.now(timezone.utc) + + api_scenarios = [] + + for result in evaluation_results.results: + # Convert conversations to new format + api_conversations = [] + for conv_eval in result.conversations: + # Convert ChatHistory messages to ApiChatMessage + api_messages = [] + for msg in conv_eval.messages.messages: + timestamp = datetime.now(timezone.utc) + if msg.timestamp: + try: + if isinstance(msg.timestamp, str): + timestamp = datetime.fromisoformat( + msg.timestamp.replace("Z", "+00:00"), + ) + else: + timestamp = msg.timestamp + except (ValueError, AttributeError): + timestamp = datetime.now(timezone.utc) + + api_messages.append( + ApiChatMessage( + role=msg.role, + content=msg.content, + timestamp=timestamp, + ), + ) + + api_conversations.append( + ApiConversationEvaluation( + passed=conv_eval.passed, + messages=api_messages, + reason=conv_eval.reason if conv_eval.reason else None, + ), + ) + + api_scenarios.append( + ApiScenarioResult( + description=result.scenario.scenario, + expectedOutcome=result.scenario.expected_outcome, + totalConversations=len(api_conversations), + flaggedConversations=len( + [c for c in api_conversations if not c.passed], + ), + conversations=api_conversations, + ), + ) + + # Extract structured summary components + summary = None + key_findings = None + recommendation = None + + if structured_summary: + summary = structured_summary.overall_summary + key_findings = "\n".join( + f"• {finding}" for finding in structured_summary.key_findings + ) + recommendation = "\n".join( + f"• {rec}" for rec in structured_summary.recommendations + ) + + return ApiEvaluationResult( + scenarios=api_scenarios, + summary=summary, + keyFindings=key_findings, + recommendation=recommendation, + deepTest=deep_test, + startTime=start_time, + judgeModel=judge_model, + ) + + +def convert_with_structured_summary( + evaluation_results: EvaluationResults, + structured_summary: Optional[StructuredSummary] = 
None, + deep_test: bool = False, + start_time: Optional[datetime] = None, + judge_model: Optional[str] = None, +) -> ApiEvaluationResult: + """Convert to API format with structured summary. + + Args: + evaluation_results: Legacy evaluation results to convert + structured_summary: Structured summary from LLM + deep_test: Whether deep test mode was enabled + start_time: When the evaluation started + judge_model: The LLM judge model used + + Returns: + ApiEvaluationResult: New format with structured summary data + """ + return convert_to_api_format( + evaluation_results=evaluation_results, + structured_summary=structured_summary, + deep_test=deep_test, + start_time=start_time, + judge_model=judge_model, + ) diff --git a/rogue/server/services/interviewer_service.py b/rogue/server/services/interviewer_service.py index 1caf69b7..d9d9d333 100644 --- a/rogue/server/services/interviewer_service.py +++ b/rogue/server/services/interviewer_service.py @@ -71,7 +71,7 @@ def send_message(self, user_input: str): { "role": "user", "content": user_input, - } + }, ) # Copying the messages to avoid modifying the original list @@ -87,7 +87,7 @@ def send_message(self, user_input: str): "You have asked 3 questions. Now, provide a concise summary of " "the agent's business context based on the conversation." ), - } + }, ) try: @@ -101,7 +101,7 @@ def send_message(self, user_input: str): { "role": "assistant", "content": response.choices[0].message.content, - } + }, ) return response.choices[0].message.content diff --git a/rogue/server/services/llm_service.py b/rogue/server/services/llm_service.py index de46177f..a775edf9 100644 --- a/rogue/server/services/llm_service.py +++ b/rogue/server/services/llm_service.py @@ -1,8 +1,11 @@ +import json from typing import Optional from litellm import completion from loguru import logger from rogue_sdk.types import EvaluationResults, Scenario, Scenarios, ScenarioType +from rogue_sdk.types import StructuredSummary + SCENARIO_GENERATION_SYSTEM_PROMPT = """ # Test Scenario Designer @@ -98,7 +101,7 @@ # Evaluation Results Summarizer You are a test results summarizer. Your task is to analyze the provided evaluation results -and generate a concise, insightful, and human-readable summary in Markdown format. +and generate a structured JSON response with the summary components. ## Evaluation Results (JSON) @@ -106,22 +109,47 @@ ## Your Task -Based on the JSON data above, create a summary that includes: +Based on the JSON data above, create a structured summary that includes: -1. **Overall Summary**: A brief, high-level overview of the agent's performance, - highlighting the pass/fail ratio and any critical issues discovered. -2. **Key Findings**: Bullet points detailing the most significant discoveries, both - positive and negative. Focus on patterns of failure or notable successes. -3. **Recommendations**: Suggest concrete next steps for improving the agent. These +1. **overall_summary**: A brief, high-level overview of the agent's performance, + highlighting the pass/fail ratio and any critical issues discovered. Return as a single string. +2. **key_findings**: List of the most significant discoveries, both positive and negative. + Focus on patterns of failure or notable successes. Return as an array of strings. +3. **recommendations**: List of concrete next steps for improving the agent. These could include fixing specific bugs, improving training data, or clarifying policies. -4. 
**Detailed Breakdown**: A table that provides a granular look at each - scenario that was tested, including the pass/fail with the appropriate emoji ✅/❌ status and a brief note on the outcome. + Return as an array of strings. +4. **detailed_breakdown**: Array of objects representing a table that provides a granular + look at each scenario tested. Each object should have: scenario, status (✅/❌), outcome. + +## Output Format +You MUST respond with valid JSON in exactly this format: + +```json +{ + "overall_summary": "Brief overview text here...", + "key_findings": [ + "First key finding", + "Second key finding" + ], + "recommendations": [ + "First recommendation", + "Second recommendation" + ], + "detailed_breakdown": [ + { + "scenario": "Scenario name", + "status": "✅", + "outcome": "Brief outcome description" + } + ] +} +``` ## Guidelines - Use clear and professional language. -- Format the output using Markdown for readability (headings, bold text, lists, etc.). - Be objective and base your summary strictly on the provided data. -- Ensure the summary is well-organized and easy to navigate. +- Return ONLY valid JSON - no markdown, no explanations, no additional text. +- Ensure all strings are properly escaped for JSON. """ # noqa: E501 @@ -142,13 +170,18 @@ def generate_scenarios( context: str, llm_provider_api_key: Optional[str] = None, ) -> Scenarios: - """ - Generates scenarios for the given business context using the given model. - :param model: LLM model to use for scenario generation. - :param context: Business context to use for scenario generation. - :param llm_provider_api_key: api key for the LLM provider - (if applicable, env can also be used instead). - :return: The generated scenarios + """Generate test scenarios from business context using LLM. + + Args: + model: LLM model to use for generation + context: Business context description for scenario generation + llm_provider_api_key: API key for the LLM provider + + Returns: + Scenarios: Generated test scenarios + + Raises: + Exception: If scenario generation fails """ system_prompt = SCENARIO_GENERATION_SYSTEM_PROMPT.replace( r"{$BUSINESS_CONTEXT}", @@ -188,7 +221,7 @@ def generate_summary_from_results( model: str, results: EvaluationResults, llm_provider_api_key: Optional[str] = None, - ) -> str: + ) -> StructuredSummary: system_prompt = SUMMARY_GENERATION_SYSTEM_PROMPT.replace( r"{$EVALUATION_RESULTS}", results.model_dump_json(indent=2), @@ -198,7 +231,10 @@ def generate_summary_from_results( {"role": "system", "content": system_prompt}, { "role": "user", - "content": "Please generate the summary based on the provided results.", + "content": ( + "Please generate the structured summary based on the " + "provided results." 
+ ), }, ] @@ -210,7 +246,38 @@ def generate_summary_from_results( messages=messages, api_key=api_key, ) - return response.choices[0].message.content + + # Parse the JSON response from the LLM + content = response.choices[0].message.content.strip() + + # Remove markdown code blocks if present + if content.startswith("```json"): + content = content[7:] + if content.endswith("```"): + content = content[:-3] + content = content.strip() + + # Parse JSON and create StructuredSummary + summary_data = json.loads(content) + return StructuredSummary(**summary_data) + + except json.JSONDecodeError as e: + logger.exception(f"Failed to parse JSON response from LLM: {e}") + # Return a fallback structured summary + return StructuredSummary( + overall_summary="Error: Could not parse summary response from LLM.", + key_findings=["Unable to generate key findings due to parsing error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) except Exception: logger.exception("Failed to generate summary from results") - return "Error: Could not generate a summary for the evaluation results." + # Return a fallback structured summary + return StructuredSummary( + overall_summary=( + "Error: Could not generate a summary for the evaluation results." + ), + key_findings=["Unable to generate key findings due to system error."], + recommendations=["Please review the evaluation results manually."], + detailed_breakdown=[], + ) diff --git a/rogue/server/services/qualifire_service.py b/rogue/server/services/qualifire_service.py new file mode 100644 index 00000000..bfdf1356 --- /dev/null +++ b/rogue/server/services/qualifire_service.py @@ -0,0 +1,40 @@ +import requests +from loguru import logger + +from .api_format_service import convert_with_structured_summary +from rogue_sdk.types import EvaluationResults, ReportSummaryRequest + + +class QualifireService: + @staticmethod + def report_summary( + request: ReportSummaryRequest, + evaluation_results: EvaluationResults, + ): + logger.info( + "Reporting summary to Qualifire", + ) + + api_evaluation_result = convert_with_structured_summary( + evaluation_results=evaluation_results, + structured_summary=request.structured_summary, + deep_test=request.deep_test, + start_time=request.start_time, + judge_model=request.judge_model, + ) + + response = requests.post( + f"{request.qualifire_url}/api/rogue/v1/report", + headers={"X-qualifire-key": request.qualifire_api_key}, + json=api_evaluation_result.model_dump(mode="json"), + timeout=300, + ) + + if not response.ok: + logger.error( + "Failed to report summary to Qualifire", + extra={"response": response.json()}, + ) + raise Exception(f"Failed to report summary to Qualifire: {response.json()}") + + return response.json() diff --git a/rogue/server/services/scenario_evaluation_service.py b/rogue/server/services/scenario_evaluation_service.py index 4ed69ba0..8de94c76 100644 --- a/rogue/server/services/scenario_evaluation_service.py +++ b/rogue/server/services/scenario_evaluation_service.py @@ -78,7 +78,7 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: results = data if results and results.results: logger.info( - f"📊 Processing {len(results.results)} evaluation results" + f"📊 Processing {len(results.results)} evaluation results", ) for res in results.results: self._results.add_result(res) @@ -86,12 +86,12 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: logger.warning("⚠️ Received results update but no results data") else: # it's 
a 'chat' or 'status' update logger.debug( - f"🔄 Forwarding {update_type} update: {str(data)[:50]}..." + f"🔄 Forwarding {update_type} update: {str(data)[:50]}...", ) yield update_type, data logger.info( - f"🏁 arun_evaluator_agent completed. Total updates: {update_count}" + f"🏁 arun_evaluator_agent completed. Total updates: {update_count}", ) except Exception as e: @@ -132,6 +132,6 @@ async def evaluate_scenarios(self) -> AsyncGenerator[tuple[str, Any], None]: ( "✅ ScenarioEvaluationService completed with " f"{len(self._results.results)} total results" - ) + ), ) yield "done", self._results diff --git a/rogue/tests/models/test_cli_input.py b/rogue/tests/models/test_cli_input.py index 627bc921..e2571b47 100644 --- a/rogue/tests/models/test_cli_input.py +++ b/rogue/tests/models/test_cli_input.py @@ -32,7 +32,8 @@ def test_check_auth_credentials(self, auth_type, credentials, should_raise): if should_raise: with pytest.raises( - ValidationError, match="Authentication Credentials cannot be empty" + ValidationError, + match="Authentication Credentials cannot be empty", ): CLIInput(**input_data) else: diff --git a/rogue/tests/models/test_evaluation_result.py b/rogue/tests/models/test_evaluation_result.py index b1458423..4c6277e8 100644 --- a/rogue/tests/models/test_evaluation_result.py +++ b/rogue/tests/models/test_evaluation_result.py @@ -1,4 +1,5 @@ import pytest +from datetime import datetime from rogue_sdk.types import ( ChatHistory, ChatMessage, @@ -7,6 +8,8 @@ EvaluationResults, Scenario, ) +from rogue.server.services.api_format_service import convert_to_api_format +from rogue.server.models.api_format import ApiEvaluationResult, StructuredSummary class TestEvaluationResults: @@ -54,26 +57,26 @@ def get_evaluation_result( EvaluationResults(), get_evaluation_result(scenario_1, conversation_1_passed), EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), ), # no overlap from non-empty results ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_2, conversation_1_failed), EvaluationResults( results=[ get_evaluation_result(scenario_1, conversation_1_passed), get_evaluation_result(scenario_2, conversation_1_failed), - ] + ], ), ), # scenario overlap with passed unchanged True -> True ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_passed), EvaluationResults( @@ -86,13 +89,13 @@ def get_evaluation_result( ], passed=True, ), - ] + ], ), ), # scenario overlap with passed changed True -> False ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_passed)] + results=[get_evaluation_result(scenario_1, conversation_1_passed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -105,13 +108,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # scenario overlap with passed unchanged False -> False (#1) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result(scenario_1, conversation_2_failed), EvaluationResults( @@ -124,13 +127,13 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), # 
scenario overlap with passed unchanged False -> False (#2) ( EvaluationResults( - results=[get_evaluation_result(scenario_1, conversation_1_failed)] + results=[get_evaluation_result(scenario_1, conversation_1_failed)], ), get_evaluation_result( scenario_1, @@ -146,7 +149,7 @@ def get_evaluation_result( ], passed=False, ), - ] + ], ), ), ], @@ -159,3 +162,57 @@ def test_add_result( ): existing_results.add_result(new_result) assert existing_results == expected_results + + def test_convert_to_api_format(self): + """Test conversion to new API format.""" + results = EvaluationResults() + result = self.get_evaluation_result(self.scenario_1, self.conversation_1_passed) + results.add_result(result) + + # Create structured summary for testing + structured_summary = StructuredSummary( + overall_summary="Test summary for overall evaluation", + key_findings=["Key finding 1", "Key finding 2"], + recommendations=["Recommendation 1", "Recommendation 2"], + detailed_breakdown=[ + {"scenario": "Test", "status": "✅", "outcome": "Passed"}, + ], + ) + + api_format = convert_to_api_format( + evaluation_results=results, + structured_summary=structured_summary, + deep_test=True, + judge_model="openai/gpt-4o-mini", + ) + + assert isinstance(api_format, ApiEvaluationResult) + assert len(api_format.scenarios) == 1 + assert api_format.scenarios[0].description == "Scenario 1" + assert api_format.scenarios[0].totalConversations == 1 + assert api_format.scenarios[0].flaggedConversations == 0 + assert len(api_format.scenarios[0].conversations) == 1 + + # Test structured summary fields + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" + assert api_format.scenarios[0].conversations[0].passed is True + assert api_format.scenarios[0].conversations[0].reason == "reason" + assert len(api_format.scenarios[0].conversations[0].messages) == 1 + + # Test message conversion + message = api_format.scenarios[0].conversations[0].messages[0] + assert message.role == "user" + assert message.content == "message 1" + assert isinstance(message.timestamp, datetime) + + # Test new fields + assert api_format.summary == "Test summary for overall evaluation" + assert api_format.keyFindings == "• Key finding 1\n• Key finding 2" + assert api_format.recommendation == "• Recommendation 1\n• Recommendation 2" + assert api_format.deepTest is True + assert api_format.judgeModel == "openai/gpt-4o-mini" + assert isinstance(api_format.startTime, datetime) diff --git a/rogue/ui/components/config_screen.py b/rogue/ui/components/config_screen.py index bd8e5793..e32e1cb5 100644 --- a/rogue/ui/components/config_screen.py +++ b/rogue/ui/components/config_screen.py @@ -36,7 +36,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, you'll be guided through an AI-powered interview to " - "extract your agent's business context. Turn off to skip this step." + "extract your agent's business context. 
Turn off to skip this step.", ) gr.Markdown("**Deep Test Mode**") @@ -46,7 +46,7 @@ def create_config_screen( ) gr.Markdown( "When enabled, the evaluator will " - "approach each scenario from different angles" + "approach each scenario from different angles", ) gr.Markdown("### Parallel Runs") @@ -76,7 +76,8 @@ def create_config_screen( ), ) auth_credentials_error = gr.Markdown( - visible=False, elem_classes=["error-label"] + visible=False, + elem_classes=["error-label"], ) gr.Markdown("## Evaluator Configuration") @@ -84,12 +85,12 @@ def create_config_screen( "Specify the models for the evaluation process. " "The **Service LLM** will be used to interview, " "generate scenarios and summaries. The **Judge LLM** is used by the " - "evaluator agent to score the agent's performance against those scenarios." + "evaluator agent to score the agent's performance against those scenarios.", ) gr.Markdown( "ℹ️ Under the hood we're using `litellm`. See the " "[list of supported models](https://docs.litellm.ai/docs/providers). " - "You can use environment variables for API keys." + "You can use environment variables for API keys.", ) service_llm = gr.Textbox( @@ -226,7 +227,8 @@ def save_config( msg = error["msg"] if loc in error_labels: error_updates[error_labels[loc]] = gr.update( - value=f"**Error:** {msg}", visible=True + value=f"**Error:** {msg}", + visible=True, ) else: logger.exception("Unhandled validation error") diff --git a/rogue/ui/components/report_generator.py b/rogue/ui/components/report_generator.py index 538491b1..db554368 100644 --- a/rogue/ui/components/report_generator.py +++ b/rogue/ui/components/report_generator.py @@ -5,6 +5,8 @@ from loguru import logger from rogue_sdk.types import EvaluationResults +from ...server.services.api_format_service import convert_with_structured_summary + def _load_report_data_from_files( evaluation_results_output_path: Path | None, @@ -60,13 +62,36 @@ def on_report_tab_select(state): ) results = EvaluationResults() + # Convert to new API format for display using server service + try: + # Extract configuration and additional metadata from state + config = state.get("config", {}) + + # For now, pass None for structured_summary since UI still uses + # string summaries. This will be updated when the UI summary generation + # is converted to structured format + api_format_results = convert_with_structured_summary( + evaluation_results=results, + structured_summary=state.get("structured_summary"), + deep_test=config.get("deep_test_mode", False), + start_time=state.get("start_time"), + judge_model=config.get("judge_llm"), + ) + results_json = api_format_results.model_dump_json( + indent=2, + exclude_none=True, + ) + except Exception as e: + logger.warning( + f"Failed to convert results to API format: {e}", + extra={ + "results": results, + }, + ) + results_json = str(results) + return { - evaluation_results_display: gr.update( - value=results.model_dump_json( - indent=2, - exclude_none=True, - ), - ), + evaluation_results_display: gr.update(value=results_json), summary_display: gr.update(value=summary), } diff --git a/rogue/ui/components/scenario_runner.py b/rogue/ui/components/scenario_runner.py index a1ea60ba..e46c5540 100644 --- a/rogue/ui/components/scenario_runner.py +++ b/rogue/ui/components/scenario_runner.py @@ -481,6 +481,7 @@ def on_status_update(status_data): # final_output_path.write_text(all_results.model_dump_json(indent=2)) # Generate summary using SDK (server-based) + summary = "Summary generation failed." 
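
To keep the Gradio-side data flow in one place: the `scenario_runner.py` hunk that follows unpacks the SDK's new tuple return and stashes the structured summary in UI state, while `report_generator.py` above feeds that state into `convert_with_structured_summary`. Below is a condensed sketch of that flow; the import paths are assumed from this diff's module layout, and the config keys and state keys are placeholders.

```python
# Condensed sketch of the UI-side flow introduced in this diff; import paths
# and config/state keys are assumptions based on the modules shown here.
from rogue_sdk import RogueSDK
from rogue_sdk.types import EvaluationResults, RogueClientConfig
from rogue.server.services.api_format_service import convert_with_structured_summary


async def summarize_and_prepare_report(all_results: EvaluationResults, state: dict, config: dict) -> None:
    sdk = RogueSDK(RogueClientConfig(base_url=state.get("rogue_server_url", "http://localhost:8000")))
    try:
        # generate_summary now returns (markdown_summary, structured_summary).
        summary, structured_summary = await sdk.generate_summary(
            results=all_results,
            model=config.get("service_llm"),
            api_key=config.get("judge_llm_api_key"),
            qualifire_api_key=config.get("qualifire_api_key"),
            deep_test=config.get("deep_test_mode", False),
            judge_model=config.get("judge_llm"),
        )
        state["summary"] = summary
        state["structured_summary"] = structured_summary
    finally:
        await sdk.close()

    # Later, the report tab renders the enhanced API format from the same state.
    api_result = convert_with_structured_summary(
        evaluation_results=all_results,
        structured_summary=state.get("structured_summary"),
        deep_test=config.get("deep_test_mode", False),
        judge_model=config.get("judge_llm"),
    )
    state["report_json"] = api_result.model_dump_json(indent=2, exclude_none=True)
```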
         try:
             sdk_config = RogueClientConfig(
                 base_url=state.get("rogue_server_url", "http://localhost:8000"),
@@ -488,12 +489,17 @@ def on_status_update(status_data):
             )
             sdk = RogueSDK(sdk_config)
 
-            summary = await sdk.generate_summary(
+            summary, structured_summary = await sdk.generate_summary(
                 results=all_results,
                 model=config.get("service_llm"),
                 api_key=config.get("judge_llm_api_key"),
+                qualifire_api_key=config.get("qualifire_api_key"),
+                deep_test=config.get("deep_test_mode", False),
+                judge_model=config.get("judge_llm"),
             )
+            state["structured_summary"] = structured_summary
+
             await sdk.close()
         except Exception:
             logger.exception("Summary generation failed")
diff --git a/sdks/python/rogue_sdk/client.py b/sdks/python/rogue_sdk/client.py
index 966e5558..09546da8 100644
--- a/sdks/python/rogue_sdk/client.py
+++ b/sdks/python/rogue_sdk/client.py
@@ -24,6 +24,7 @@
     SendMessageResponse,
     StartInterviewRequest,
     StartInterviewResponse,
+    StructuredSummary,
     SummaryGenerationRequest,
     SummaryGenerationResponse,
 )
@@ -148,11 +149,19 @@ async def generate_summary(
         results: EvaluationResults,
         model: str,
         api_key: Optional[str] = None,
+        qualifire_api_key: Optional[str] = None,
+        job_id: Optional[str] = None,
+        deep_test: bool = False,
+        judge_model: Optional[str] = None,
     ) -> SummaryGenerationResponse:
         """Generate summary via API."""
         data = SummaryGenerationRequest(
             results=results,
             model=model,
+            qualifire_api_key=qualifire_api_key,
+            job_id=job_id,
+            deep_test=deep_test,
+            judge_model=judge_model,
         )
         if api_key:
             data.api_key = api_key
@@ -162,7 +171,27 @@ async def generate_summary(
             "/api/v1/llm/summary",
             json=data.model_dump(mode="json"),
         )
-        return SummaryGenerationResponse(**response)
+
+        # Handle server's structured summary response
+        summary_data = response.get("summary", {})
+        if isinstance(summary_data, dict) and "overall_summary" in summary_data:
+            # Server returned StructuredSummary - convert to our expected format
+            structured_summary = StructuredSummary(**summary_data)
+            return SummaryGenerationResponse(
+                summary=structured_summary,
+                message=response.get("message", "Successfully generated summary"),
+            )
+        else:
+            # Fallback for legacy string response
+            return SummaryGenerationResponse(
+                summary=StructuredSummary(
+                    overall_summary=str(summary_data),
+                    key_findings=[],
+                    recommendations=[],
+                    detailed_breakdown=[],
+                ),
+                message=response.get("message", "Successfully generated summary"),
+            )
 
     async def start_interview(
         self,
@@ -237,7 +266,7 @@ async def wait_for_evaluation(
             elapsed = asyncio.get_running_loop().time() - start_time
             if elapsed >= max_wait_time:
                 raise TimeoutError(
-                    f"Evaluation {job_id} did not complete within {max_wait_time}s"
+                    f"Evaluation {job_id} did not complete within {max_wait_time}s",
                 )
 
             await asyncio.sleep(poll_interval)
diff --git a/sdks/python/rogue_sdk/sdk.py b/sdks/python/rogue_sdk/sdk.py
index efca694c..d1ec55cf 100644
--- a/sdks/python/rogue_sdk/sdk.py
+++ b/sdks/python/rogue_sdk/sdk.py
@@ -5,7 +5,7 @@
 """
 
 import asyncio
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, Tuple
 
 from loguru import logger
 from pydantic import HttpUrl
@@ -25,6 +25,7 @@
     RogueClientConfig,
     Scenarios,
     SendMessageResponse,
+    StructuredSummary,
     WebSocketEventType,
 )
 from .websocket import RogueWebSocketClient
@@ -114,7 +115,7 @@ def on_websocket_event(
         """Add WebSocket event handler."""
         if not self.ws_client:
             raise RuntimeError(
-                "WebSocket not connected. Call connect_websocket() first."
+                "WebSocket not connected. Call connect_websocket() first.",
             )
 
         self.ws_client.on(event, handler)
@@ -192,7 +193,7 @@ def handle_final_job_result(task):
                     result_future.set_result(result)
                 else:
                     result_future.set_exception(
-                        Exception("Failed to retrieve final job result")
+                        Exception("Failed to retrieve final job result"),
                    )
            except Exception as e:
                result_future.set_exception(e)
@@ -210,7 +211,7 @@ def handle_chat_update(event, data):
 
         def handle_error(event, data):
             if not result_future.done():
                 result_future.set_exception(
-                    Exception(f"WebSocket error: {data.get('error')}")
+                    Exception(f"WebSocket error: {data.get('error')}"),
                 )
 
         # Connect WebSocket for updates
@@ -228,7 +229,7 @@ def handle_error(event, data):
             return result
         except asyncio.TimeoutError:
             raise TimeoutError(
-                f"Evaluation {job_id} did not complete within {timeout}s"
+                f"Evaluation {job_id} did not complete within {timeout}s",
             )
         finally:
             await self.disconnect_websocket()
@@ -288,15 +289,57 @@ async def generate_summary(
         results: EvaluationResults,
         model: str = "openai/gpt-4o-mini",
         api_key: Optional[str] = None,
-    ) -> str:
+        qualifire_api_key: Optional[str] = None,
+        job_id: Optional[str] = None,
+        deep_test: bool = False,
+        judge_model: Optional[str] = None,
+    ) -> Tuple[str, StructuredSummary]:
         """Generate evaluation summary from results."""
         response_data = await self.http_client.generate_summary(
             results=results,
             model=model,
             api_key=api_key,
+            qualifire_api_key=qualifire_api_key,
+            job_id=job_id,
+            deep_test=deep_test,
+            judge_model=judge_model,
         )
-        return response_data.summary
+        # Convert structured summary back to string format for backward compatibility
+        structured_summary = response_data.summary
+        if hasattr(structured_summary, "overall_summary"):
+            # Format as markdown string for UI display
+            summary_parts = [
+                f"# Evaluation Results Summary\n\n## Overall Summary\n"
+                f"{structured_summary.overall_summary}",
+            ]
+
+            if structured_summary.key_findings:
+                findings = "\n".join(
+                    f"- {finding}" for finding in structured_summary.key_findings
+                )
+                summary_parts.append(f"\n---\n\n## Key Findings\n{findings}")
+
+            if structured_summary.recommendations:
+                recommendations = "\n".join(
+                    f"{i + 1}. {rec}"
+                    for i, rec in enumerate(structured_summary.recommendations)
+                )
+                summary_parts.append(f"\n---\n\n## Recommendations\n{recommendations}")
+
+            if structured_summary.detailed_breakdown:
+                breakdown = "\n".join(
+                    f"{i + 1}. {row}"
+                    for i, row in enumerate(structured_summary.detailed_breakdown)
+                )
+                summary_parts.append(f"\n---\n\n## Detailed Breakdown\n{breakdown}")
+
+            summary_parts.append("\n---\n")
+
+            return "\n".join(summary_parts), structured_summary
+        else:
+            # Fallback for string response
+            return str(structured_summary), structured_summary
 
 
     async def start_interview(
         self,
diff --git a/sdks/python/rogue_sdk/tests/test_types.py b/sdks/python/rogue_sdk/tests/test_types.py
index db652f7c..aa906bde 100644
--- a/sdks/python/rogue_sdk/tests/test_types.py
+++ b/sdks/python/rogue_sdk/tests/test_types.py
@@ -42,7 +42,10 @@ def test_validate_dataset_for_type(self, scenario_type, dataset, should_raise):
         ],
     )
     def test_validate_dataset_sample_size(
-        self, dataset, dataset_sample_size, should_raise
+        self,
+        dataset,
+        dataset_sample_size,
+        should_raise,
     ):
         input_data = {
             "scenario": "Test Scenario",
@@ -55,7 +58,8 @@ def test_validate_dataset_sample_size(
 
         if should_raise:
             with pytest.raises(
-                ValidationError, match="`dataset_sample_size` must be set"
+                ValidationError,
+                match="`dataset_sample_size` must be set",
             ):
                 Scenario(**input_data)
         else:
diff --git a/sdks/python/rogue_sdk/types.py b/sdks/python/rogue_sdk/types.py
index b5359716..70cd15e8 100644
--- a/sdks/python/rogue_sdk/types.py
+++ b/sdks/python/rogue_sdk/types.py
@@ -77,6 +77,7 @@ class AgentConfig(BaseModel):
     parallel_runs: int = 1
     judge_llm_api_key: Optional[str] = None
     business_context: str = ""
+    qualifire_api_key: Optional[str] = None
 
     @model_validator(mode="after")
     def check_auth_credentials(self) -> "AgentConfig":
@@ -85,7 +86,7 @@ def check_auth_credentials(self) -> "AgentConfig":
 
         if auth_type and auth_type != AuthType.NO_AUTH and not auth_credentials:
             raise ValueError(
-                "Authentication Credentials cannot be empty for the selected auth type."
+                "Authentication Credentials cannot be empty for the selected auth type.",  # noqa: E501
             )
 
         return self
@@ -110,7 +111,7 @@ def validate_dataset_for_type(self) -> "Scenario":
         if dataset_required and self.dataset is None:
             raise ValueError(
                 f"`dataset` must be provided when scenario_type is "
-                f"'{self.scenario_type.value}'"
+                f"'{self.scenario_type.value}'",
             )
         elif not dataset_required and self.dataset is not None:
             logger.info(
@@ -143,7 +144,7 @@ def get_scenarios_by_type(self, scenario_type: ScenarioType) -> "Scenarios":
                 scenario
                 for scenario in self.scenarios
                 if scenario.scenario_type == scenario_type
-            ]
+            ],
         )
 
     def get_policy_scenarios(self) -> "Scenarios":
@@ -207,6 +208,101 @@ def combine(self, other: "EvaluationResults"):
             self.add_result(result)
 
+
+# New API Format Types
+
+
+class ApiChatMessage(BaseModel):
+    """Chat message for new API format with datetime timestamp."""
+
+    role: str
+    content: str
+    timestamp: datetime
+
+
+class ApiConversationEvaluation(BaseModel):
+    """Conversation evaluation for new API format."""
+
+    passed: bool
+    messages: List[ApiChatMessage]
+    reason: Optional[str] = None
+
+
+class ApiScenarioResult(BaseModel):
+    """Result of evaluating a single scenario in new API format."""
+
+    description: Optional[str] = None
+    totalConversations: Optional[int] = None
+    flaggedConversations: Optional[int] = None
+    conversations: List[ApiConversationEvaluation]
+
+
+class ApiEvaluationResult(BaseModel):
+    """New API format for evaluation results."""
+
+    scenarios: List[ApiScenarioResult]
+
+
+# Conversion functions for new API format
+def convert_to_api_format(evaluation_results: EvaluationResults) -> ApiEvaluationResult:
+    """Convert legacy EvaluationResults to new API format.
+
+    Args:
+        evaluation_results: Legacy evaluation results to convert
+
+    Returns:
+        ApiEvaluationResult: New format evaluation result
+    """
+    api_scenarios = []
+
+    for result in evaluation_results.results:
+        # Convert conversations to new format
+        api_conversations = []
+        for conv_eval in result.conversations:
+            # Convert ChatHistory messages to ApiChatMessage
+            api_messages = []
+            for msg in conv_eval.messages.messages:
+                timestamp = datetime.now(timezone.utc)
+                if msg.timestamp:
+                    try:
+                        if isinstance(msg.timestamp, str):
+                            timestamp = datetime.fromisoformat(
+                                msg.timestamp.replace("Z", "+00:00"),
+                            )
+                        else:
+                            timestamp = msg.timestamp
+                    except (ValueError, AttributeError):
+                        timestamp = datetime.now(timezone.utc)
+
+                api_messages.append(
+                    ApiChatMessage(
+                        role=msg.role,
+                        content=msg.content,
+                        timestamp=timestamp,
+                    ),
+                )
+
+            api_conversations.append(
+                ApiConversationEvaluation(
+                    passed=conv_eval.passed,
+                    messages=api_messages,
+                    reason=conv_eval.reason if conv_eval.reason else None,
+                ),
+            )
+
+        api_scenarios.append(
+            ApiScenarioResult(
+                description=result.scenario.scenario,
+                totalConversations=len(api_conversations),
+                flaggedConversations=len(
+                    [c for c in api_conversations if not c.passed],
+                ),
+                conversations=api_conversations,
+            ),
+        )
+
+    return ApiEvaluationResult(scenarios=api_scenarios)
+
 
 # Interview Types
 
 
@@ -290,6 +386,8 @@ class EvaluationJob(BaseModel):
     results: Optional[List[EvaluationResult]] = None
     error_message: Optional[str] = None
     progress: float = 0.0
+    deep_test: bool = False
+    judge_model: Optional[str] = None
 
 
 class EvaluationResponse(BaseModel):
@@ -336,12 +434,26 @@ class SummaryGenerationRequest(BaseModel):
     results: EvaluationResults
     model: str = "openai/gpt-4.1"
     api_key: Optional[str] = None
+    job_id: Optional[str] = None
+    deep_test: bool = False
+    judge_model: Optional[str] = None
+    qualifire_api_key: Optional[str] = None
+    qualifire_url: Optional[str] = "https://app.qualifire.ai"
+
+
+class StructuredSummary(BaseModel):
+    """Structured summary response from LLM."""
+
+    overall_summary: str
+    key_findings: List[str]
+    recommendations: List[str]
+    detailed_breakdown: List[dict]  # Table rows for scenario breakdown
 
 
 class SummaryGenerationResponse(BaseModel):
     """Response containing generated summary."""
 
-    summary: str
+    summary: StructuredSummary
     message: str
 
 
@@ -384,3 +496,21 @@ def validate_base_url(cls, v: str | HttpUrl) -> HttpUrl:
         if isinstance(v, str):
             return HttpUrl(v)
         return v
+
+
+class ReportSummaryRequest(BaseModel):
+    """Request to report a summary."""
+
+    job_id: str
+    structured_summary: Optional[StructuredSummary] = None
+    deep_test: bool = False
+    judge_model: Optional[str] = None
+    start_time: Optional[datetime] = None
+    qualifire_api_key: Optional[str] = None
+    qualifire_url: Optional[str] = "https://app.qualifire.ai"
+
+
+class ReportSummaryResponse(BaseModel):
+    """Response to report a summary."""
+
+    success: bool
diff --git a/sdks/python/rogue_sdk/websocket.py b/sdks/python/rogue_sdk/websocket.py
index f0a79797..7b9d1b76 100644
--- a/sdks/python/rogue_sdk/websocket.py
+++ b/sdks/python/rogue_sdk/websocket.py
@@ -164,7 +164,7 @@ def _emit(self, event: WebSocketEventType, data: Any) -> None:
                         )
                         if t.exception()
                         else None
-                    )
+                    ),
                 )
             else:
                 handler(event, data)
@@ -180,7 +180,7 @@ async def _schedule_reconnect(self) -> None:
         delay = self.reconnect_delay * (2 ** (self.reconnect_attempts - 1))
 
         logger.info(
-            f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s"
+            f"Scheduling reconnect attempt {self.reconnect_attempts} in {delay}s",
        )
 
         await asyncio.sleep(delay)
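
Note on the new types added to sdks/python/rogue_sdk/types.py in this patch: StructuredSummary carries the typed summary fields, and convert_to_api_format() flattens an EvaluationResults into the ApiEvaluationResult shape. A minimal usage sketch follows; the import path and the sample field values are assumptions for illustration, not part of the patch:

    # Sketch only: assumes these names are importable from rogue_sdk.types as laid out above.
    from rogue_sdk.types import (
        EvaluationResults,
        StructuredSummary,
        convert_to_api_format,
    )

    results = EvaluationResults()  # normally populated via add_result(...) from a finished run

    summary = StructuredSummary(
        overall_summary="The agent held to policy in most scenarios.",
        key_findings=["No prompt leakage observed"],
        recommendations=["Clarify the refund policy wording"],
        detailed_breakdown=[{"scenario": "Refunds", "status": "✅", "outcome": "Passed"}],
    )

    # Convert the legacy results into the new API shape and inspect per-scenario counts.
    api_result = convert_to_api_format(evaluation_results=results)
    for scenario in api_result.scenarios:
        print(scenario.description, scenario.flaggedConversations, "flagged")
    print(summary.overall_summary)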
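
The RogueSDK.generate_summary() change above now returns a (markdown string, StructuredSummary) pair, which is how scenario_runner.py consumes it. A hedged caller sketch; the import paths, base URL, and model names are assumptions, and it expects a Rogue server listening on the configured base_url:

    import asyncio

    # Import paths are an assumption based on this patch's module layout.
    from rogue_sdk.sdk import RogueSDK
    from rogue_sdk.types import EvaluationResults, RogueClientConfig


    async def main() -> None:
        sdk = RogueSDK(RogueClientConfig(base_url="http://localhost:8000"))
        try:
            # generate_summary now yields both the legacy string and the structured form.
            summary_md, structured = await sdk.generate_summary(
                results=EvaluationResults(),  # normally the finished run's results
                model="openai/gpt-4o-mini",
                deep_test=True,
                judge_model="openai/gpt-4.1",
            )
            print(summary_md)               # backward-compatible markdown for the UI
            print(structured.key_findings)  # typed fields for API reporting
        finally:
            await sdk.close()


    asyncio.run(main())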