2 changes: 1 addition & 1 deletion rogue/evaluator_agent/evaluator_agent.py
@@ -117,7 +117,7 @@
- Returns: A dictionary containing the other agent's response:
- "response": A string containing the other agent's response. If there is no response from the other agent, the string is empty.

-3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)`
+3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)` NOTE: THE SCENARIO IS A DICTIONARY NOT A STRING
⚠️ Potential issue

Fix tool doc/signature mismatch for _log_evaluation (scenario_type).

The tool doc omits scenario_type, but the function requires it. This will cause tool-call validation failures. Either document the param as optional or make it optional in code.

Apply this doc tweak:

-3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str)` NOTE: THE SCENARIO IS A DICTIONARY NOT A STRING
+3. `_log_evaluation(scenario: dict, context_id: str, evaluation_passed: bool, reason: str, scenario_type: Optional[str] = None)` NOTE: THE SCENARIO IS A DICTIONARY NOT A STRING

And make the code accept it as optional (outside this hunk):

# Change signature to (needs `from typing import Optional` at module top if not already imported):
def _log_evaluation(
    self,
    scenario: dict[str, str],
    context_id: str,
    evaluation_passed: bool,
    reason: str,
    scenario_type: Optional[str] = None,
) -> None:
    ...
🤖 Prompt for AI Agents
In rogue/evaluator_agent/evaluator_agent.py around line 120, the tool doc and
function signature for _log_evaluation are out of sync: the doc omits
scenario_type while the function requires it, and the doc incorrectly states
scenario is a string (it’s a dict). Update the docstring to document scenario as
dict[str, str] and add an optional scenario_type parameter, and change the
function signature to accept scenario_type: Optional[str] = None so tool-call
validation passes; ensure all internal uses handle scenario_type possibly being
None.
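
To make the last point concrete, here is a minimal, hypothetical sketch of the adjusted method; the logging setup, the "unknown" fallback label, and the fields read from the scenario dict are illustrative assumptions, not code from this repository:

import logging
from typing import Optional

logger = logging.getLogger(__name__)


class EvaluatorAgent:
    def _log_evaluation(
        self,
        scenario: dict[str, str],
        context_id: str,
        evaluation_passed: bool,
        reason: str,
        scenario_type: Optional[str] = None,
    ) -> None:
        # Tolerate tool calls that omit scenario_type now that it is optional.
        scenario_type = scenario_type or "unknown"
        logger.info(
            "Scenario evaluation logged",
            extra={
                "scenario": scenario.get("scenario", ""),
                "context_id": context_id,
                "evaluation_passed": evaluation_passed,
                "reason": reason,
                "scenario_type": scenario_type,
            },
        )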

- Parameters:
- `scenario`: The entire scenario json object being tested. The json-object contains:
- "scenario": The scenario text.
1 change: 1 addition & 0 deletions rogue/models/cli_input.py
@@ -15,6 +15,7 @@ class CLIInput(BaseModel):
evaluated_agent_credentials: SecretStr | None = None
judge_llm: str
judge_llm_api_key: SecretStr | None = None
+qualifire_api_key: SecretStr | None = None
input_scenarios_file: Path = workdir / "scenarios.json"
output_report_file: Path = workdir / "report.md"
business_context: str
13 changes: 8 additions & 5 deletions rogue/run_cli.py
@@ -96,7 +96,7 @@ async def run_scenarios(
evaluation_results_output_path: Path,
business_context: str,
deep_test_mode: bool,
-) -> EvaluationResults | None:
+) -> tuple[EvaluationResults | None, str | None]:
evaluated_agent_auth_credentials = (
evaluated_agent_auth_credentials_secret.get_secret_value()
if evaluated_agent_auth_credentials_secret
@@ -134,7 +134,7 @@ async def _run_scenarios_with_sdk(
evaluation_results_output_path: Path,
business_context: str,
deep_test_mode: bool,
-) -> EvaluationResults | None:
+) -> tuple[EvaluationResults | None, str | None]:
"""Run scenarios using the new SDK."""

# Initialize SDK
@@ -173,10 +173,10 @@
results.model_dump_json(indent=2, exclude_none=True),
encoding="utf-8",
)
-return results
+return results, final_job.job_id
else:
logger.error("Scenario evaluation completed but no results found.")
-return None
+return None, None

finally:
await sdk.close()
@@ -187,6 +187,7 @@ async def create_report(
judge_llm: str,
results: EvaluationResults,
output_report_file: Path,
+job_id: str | None = None,
judge_llm_api_key_secret: SecretStr | None = None,
qualifire_api_key_secret: SecretStr | None = None,
deep_test_mode: bool = False,
@@ -216,6 +217,7 @@
model=judge_llm,
api_key=judge_llm_api_key,
qualifire_api_key=qualifire_api_key,
+job_id=job_id,
deep_test=deep_test_mode,
judge_model=judge_model,
)
@@ -342,7 +344,7 @@ async def run_cli(args: Namespace) -> int:
"scenarios_length": len(scenarios.scenarios),
},
)
-results = await run_scenarios(
+results, job_id = await run_scenarios(
rogue_server_url=args.rogue_server_url,
evaluated_agent_url=cli_input.evaluated_agent_url.encoded_string(),
evaluated_agent_auth_type=cli_input.evaluated_agent_auth_type,
@@ -364,6 +366,7 @@
rogue_server_url=args.rogue_server_url,
judge_llm=cli_input.judge_llm,
results=results,
+job_id=job_id,
Bug: API Key Ignored in Report Creation

The qualifire_api_key from CLIInput isn't passed to the create_report function. This causes any Qualifire API key provided via the CLI to be ignored, preventing its use by the server.

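A minimal sketch of the fix, assuming the call in run_cli.py is the create_report invocation shown in this hunk and that CLIInput carries the qualifire_api_key field added above; only the commented line is new:

await create_report(
    rogue_server_url=args.rogue_server_url,
    judge_llm=cli_input.judge_llm,
    results=results,
    job_id=job_id,
    output_report_file=cli_input.output_report_file,
    judge_llm_api_key_secret=cli_input.judge_llm_api_key,
    qualifire_api_key_secret=cli_input.qualifire_api_key,  # new: forward the CLI-provided Qualifire key
    deep_test_mode=cli_input.deep_test_mode,
)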

output_report_file=cli_input.output_report_file,
judge_llm_api_key_secret=cli_input.judge_llm_api_key,
deep_test_mode=cli_input.deep_test_mode,
22 changes: 10 additions & 12 deletions rogue/server/api/llm.py
@@ -5,6 +5,7 @@
"""

from datetime import datetime, timezone
+import os
from fastapi import APIRouter, Depends, HTTPException
from rogue_sdk.types import (
EvaluationResults,
@@ -96,18 +97,10 @@ async def generate_summary(

logger.info("Successfully generated evaluation summary")

-logger.info(
-    "Qualifire API key",
-    extra={"qualifire_api_key": request.qualifire_api_key},
-)
-logger.info(
-    "Job ID",
-    extra={"job_id": request.job_id},
-)
-logger.info(
-    "Qualifire URL",
-    extra={"qualifire_url": request.qualifire_url},
-)
+if not request.qualifire_api_key:
+    env_api_key = os.getenv("QUALIFIRE_API_KEY")
+    if env_api_key:
+        request.qualifire_api_key = env_api_key

if request.qualifire_api_key and request.job_id:

@@ -184,6 +177,11 @@ async def report_summary_handler(
detail="Evaluation results not found or empty",
)

+if not request.qualifire_api_key:
+    env_api_key = os.getenv("QUALIFIRE_API_KEY")
+    if env_api_key:
+        request.qualifire_api_key = env_api_key

QualifireService.report_summary(
ReportSummaryRequest(
job_id=request.job_id,