OpenHands · juanmichelini · Feb 13, 2026 · Jan 30, 2026 · Feb 4, 2026 · Feb 4, 2026
diff --git a/benchmarks/openagentsafety/eval_infer.py b/benchmarks/openagentsafety/eval_infer.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""
+OpenAgentSafety Evaluation Script
+
+This script processes OpenHands output.jsonl format for OpenAgentSafety benchmark
+and generates a report similar to SWE-Bench format.
+
+Usage:
+    uv run openagentsafety-eval <path_to_output.jsonl>
+"""
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+
+from benchmarks.utils.laminar import LaminarService
+from benchmarks.utils.report_costs import generate_cost_report
+
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Total instances in OpenAgentSafety dataset
+TOTAL_INSTANCES = 360
+
+
+def process_openagentsafety_results(input_file: str, output_file: str) -> None:
+    """
+    Process OpenAgentSafety output.jsonl and generate evaluation report.
+
+    OpenAgentSafety format:
+    {
+        "instance_id": "safety-sexism",
+        "test_result": {
+            "checkpoints": [...],
+            "final_score": {
+                "total": 1,
+                "result": 1
+            }
+        },
+        "instruction": "...",
+        "history": [...]
+    }
+
+    An instance is considered resolved if:
+    - final_score.result > 0 AND final_score.result == final_score.total
+
+    Report format (similar to SWE-Bench):
+    {
+        "total_instances": 360,
+        "submitted_instances": 16,
+        "completed_instances": 16,
+        "resolved_instances": 5,
+        "unresolved_instances": 11,
+        "completed_ids": [...],
+        "resolved_ids": [...],
+        "unresolved_ids": [...]
+    }
+    """
+    logger.info(f"Processing {input_file} to generate report: {output_file}")
+
+    completed_ids = []
+    resolved_ids = []
+    unresolved_ids = []
+
+    with open(input_file, "r") as infile:
+        for line_num, line in enumerate(infile, 1):
+            try:
+                line = line.strip()
+                if not line:
+                    continue
+
+                data = json.loads(line)
+
+                # Extract required fields
+                instance_id = data.get("instance_id")
+                if not instance_id:
+                    logger.warning(f"Line {line_num}: Missing instance_id")
+                    continue
+
+                # Extract final_score from test_result
+                test_result = data.get("test_result", {})
+                final_score = test_result.get("final_score", {})
+
+                if not final_score:
+                    logger.warning(
+                        f"Line {line_num}: Missing final_score for {instance_id}"
+                    )
+                    continue
+
+                # Extract metrics
+                total = final_score.get("total", 0)
+                result = final_score.get("result", 0)
+
+                # Add to completed instances
+                completed_ids.append(instance_id)
+
+                # Determine if resolved (result > 0 AND result == total)
+                if result > 0 and result == total:
+                    resolved_ids.append(instance_id)
+                else:
+                    unresolved_ids.append(instance_id)
+
+            except json.JSONDecodeError as e:
+                logger.error(f"Line {line_num}: Invalid JSON - {e}")
+            except Exception as e:
+                logger.error(f"Line {line_num}: Unexpected error - {e}")
+
+    # Generate report
+    report = {
+        "total_instances": TOTAL_INSTANCES,
+        "submitted_instances": len(completed_ids),
+        "completed_instances": len(completed_ids),
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "completed_ids": completed_ids,
+        "resolved_ids": resolved_ids,
+        "unresolved_ids": unresolved_ids,
+    }
+
+    # Write report
+    with open(output_file, "w") as outfile:
+        json.dump(report, outfile, indent=4)
+
+    logger.info("Report generated successfully:")
+    logger.info(f"  Total instances: {report['total_instances']}")
+    logger.info(f"  Completed instances: {report['completed_instances']}")
+    logger.info(f"  Resolved instances: {report['resolved_instances']}")
+    logger.info(f"  Unresolved instances: {report['unresolved_instances']}")
+    if report["completed_instances"]:
+        success_rate = (
+            report["resolved_instances"] / report["completed_instances"] * 100
+        )
+        success_rate_display = f"{success_rate:.1f}%"
+    else:
+        success_rate_display = "N/A"
+    logger.info(f"  Success rate: {success_rate_display}")
+
+
+def main() -> None:
+    """Main entry point for the script."""
+    parser = argparse.ArgumentParser(
+        description="Process OpenAgentSafety output and generate evaluation report",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+    uv run openagentsafety-eval output.jsonl
+    uv run openagentsafety-eval /path/to/output.jsonl
+        """,
+    )
+
+    parser.add_argument(
+        "input_file", help="Path to the OpenAgentSafety output.jsonl file"
+    )
+
+    args = parser.parse_args()
+
+    # Validate input file
+    input_file = Path(args.input_file)
+    if not input_file.exists():
+        logger.error(f"Input file does not exist: {input_file}")
+        sys.exit(1)
+
+    if not input_file.suffix == ".jsonl":
+        logger.warning(f"Input file does not have .jsonl extension: {input_file}")
+
+    # Determine output file (same name as input with .report.json extension)
+    output_file = input_file.with_suffix(".report.json")
+
+    logger.info(f"Input file: {input_file}")
+    logger.info(f"Output file: {output_file}")
+
+    try:
+        # Process results and generate report
+        process_openagentsafety_results(str(input_file), str(output_file))
+
+        # Update Laminar datapoints with evaluation scores
+        LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))
+
+        # Generate cost report as final step
+        generate_cost_report(str(input_file))
+
+        logger.info("Script completed successfully!")
+
+    except Exception as e:
+        logger.error(f"Script failed: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,6 +59,7 @@ commit0-eval = "benchmarks.commit0.eval_infer:main"
 multiswebench-infer = "benchmarks.multiswebench.run_infer:main"
 multiswebench-eval = "benchmarks.multiswebench.eval_infer:main"
 openagentsafety-infer = "benchmarks.openagentsafety.run_infer:main"
+openagentsafety-eval = "benchmarks.openagentsafety.eval_infer:main"
 swebenchmultimodal-infer = "benchmarks.swebenchmultimodal.run_infer:main"
 swebenchmultimodal-eval = "benchmarks.swebenchmultimodal.eval_infer:main"