diff --git a/rogue/evaluator_agent/evaluator_agent.py b/rogue/evaluator_agent/evaluator_agent.py
index 001ae760..4effa04d 100644
--- a/rogue/evaluator_agent/evaluator_agent.py
+++ b/rogue/evaluator_agent/evaluator_agent.py
@@ -117,7 +117,7 @@
 - Returns: A dictionary containing the other agent's response:
 - "response": A string containing the other agent's response. If there is no response from the other agent, the string is empty.
-3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)`
+3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)` NOTE: THE SCENARIO IS A DICTIONARY NOT A STRING
 - Parameters:
 - `scenario`: The entire scenario json object being tested. The json-object contains:
 - "scenario": The scenario text.
diff --git a/rogue/models/cli_input.py b/rogue/models/cli_input.py
index 063ed902..fdf5f6cb 100644
--- a/rogue/models/cli_input.py
+++ b/rogue/models/cli_input.py
@@ -15,6 +15,7 @@ class CLIInput(BaseModel):
     evaluated_agent_credentials: SecretStr | None = None
     judge_llm: str
     judge_llm_api_key: SecretStr | None = None
+    qualifire_api_key: SecretStr | None = None
     input_scenarios_file: Path = workdir / "scenarios.json"
     output_report_file: Path = workdir / "report.md"
     business_context: str
diff --git a/rogue/run_cli.py b/rogue/run_cli.py
index 614af6b4..c5fc4ed8 100644
--- a/rogue/run_cli.py
+++ b/rogue/run_cli.py
@@ -96,7 +96,7 @@ async def run_scenarios(
     evaluation_results_output_path: Path,
     business_context: str,
     deep_test_mode: bool,
-) -> EvaluationResults | None:
+) -> tuple[EvaluationResults | None, str | None]:
     evaluated_agent_auth_credentials = (
         evaluated_agent_auth_credentials_secret.get_secret_value()
         if evaluated_agent_auth_credentials_secret
@@ -134,7 +134,7 @@ async def _run_scenarios_with_sdk(
     evaluation_results_output_path: Path,
     business_context: str,
     deep_test_mode: bool,
-) -> EvaluationResults | None:
+) -> tuple[EvaluationResults | None, str | None]:
     """Run scenarios using the new SDK."""

     # Initialize SDK
@@ -173,10 +173,10 @@
                 results.model_dump_json(indent=2, exclude_none=True),
                 encoding="utf-8",
             )
-            return results
+            return results, final_job.job_id
         else:
             logger.error("Scenario evaluation completed but no results found.")
-            return None
+            return None, None

     finally:
         await sdk.close()
@@ -187,6 +187,7 @@ async def create_report(
     judge_llm: str,
     results: EvaluationResults,
     output_report_file: Path,
+    job_id: str | None = None,
     judge_llm_api_key_secret: SecretStr | None = None,
     qualifire_api_key_secret: SecretStr | None = None,
     deep_test_mode: bool = False,
@@ -216,6 +217,7 @@
         model=judge_llm,
         api_key=judge_llm_api_key,
         qualifire_api_key=qualifire_api_key,
+        job_id=job_id,
         deep_test=deep_test_mode,
         judge_model=judge_model,
     )
@@ -342,7 +344,7 @@ async def run_cli(args: Namespace) -> int:
             "scenarios_length": len(scenarios.scenarios),
         },
     )
-    results = await run_scenarios(
+    results, job_id = await run_scenarios(
         rogue_server_url=args.rogue_server_url,
         evaluated_agent_url=cli_input.evaluated_agent_url.encoded_string(),
         evaluated_agent_auth_type=cli_input.evaluated_agent_auth_type,
@@ -364,6 +366,7 @@
         rogue_server_url=args.rogue_server_url,
         judge_llm=cli_input.judge_llm,
         results=results,
+        job_id=job_id,
         output_report_file=cli_input.output_report_file,
         judge_llm_api_key_secret=cli_input.judge_llm_api_key,
         deep_test_mode=cli_input.deep_test_mode,
diff --git a/rogue/server/api/llm.py b/rogue/server/api/llm.py
index 28b271a0..8106d791 100644
--- a/rogue/server/api/llm.py
+++ b/rogue/server/api/llm.py
@@ -5,6 +5,7 @@
 """

 from datetime import datetime, timezone
+import os
 from fastapi import APIRouter, Depends, HTTPException
 from rogue_sdk.types import (
     EvaluationResults,
@@ -96,18 +97,10 @@
     logger.info("Successfully generated evaluation summary")

-    logger.info(
-        "Qualifire API key",
-        extra={"qualifire_api_key": request.qualifire_api_key},
-    )
-    logger.info(
-        "Job ID",
-        extra={"job_id": request.job_id},
-    )
-    logger.info(
-        "Qualifire URL",
-        extra={"qualifire_url": request.qualifire_url},
-    )
+    if not request.qualifire_api_key:
+        env_api_key = os.getenv("QUALIFIRE_API_KEY")
+        if env_api_key:
+            request.qualifire_api_key = env_api_key

     if request.qualifire_api_key and request.job_id:
@@ -184,6 +177,11 @@ async def report_summary_handler(
             detail="Evaluation results not found or empty",
         )

+    if not request.qualifire_api_key:
+        env_api_key = os.getenv("QUALIFIRE_API_KEY")
+        if env_api_key:
+            request.qualifire_api_key = env_api_key
+
     QualifireService.report_summary(
         ReportSummaryRequest(
             job_id=request.job_id,
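
Note on the rogue/server/api/llm.py change: both `generate_summary` and `report_summary_handler` gain the same environment-variable fallback, replacing log statements that wrote the Qualifire API key, job ID, and URL into the logs. Below is a minimal standalone sketch of that fallback pattern; the `SummaryRequest` stand-in is hypothetical, while the `QUALIFIRE_API_KEY` variable name and the precedence rule (an explicit key on the request wins over the environment) come from the diff above:

    import os
    from dataclasses import dataclass

    @dataclass
    class SummaryRequest:
        # Hypothetical stand-in for the request models in rogue/server/api/llm.py.
        qualifire_api_key: str | None = None
        job_id: str | None = None

    def apply_qualifire_key_fallback(request: SummaryRequest) -> None:
        # Consult the environment only when the request carries no explicit key;
        # leave the request untouched when neither source provides one.
        if not request.qualifire_api_key:
            env_api_key = os.getenv("QUALIFIRE_API_KEY")
            if env_api_key:
                request.qualifire_api_key = env_api_key

Since the fallback block is duplicated verbatim in the two handlers, extracting a small helper like this would keep the logic in one place.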