Merge branch 'main' into main

zhudotexe · Mar 4, 2025 · 7c62f0f
2 parents ce2fcbc + db950db
commit 7c62f0f
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 14 deletions.
diff --git a/.github/workflows/leaderboard-submission.yml b/.github/workflows/leaderboard-submission.yml
@@ -16,6 +16,8 @@ permissions:
 jobs:
   evaluate:
     runs-on: ubuntu-latest
+    environment:
+      name: leaderboard-eval-run
     env:
       GH_TOKEN: ${{ github.token }}
 
@@ -44,6 +46,12 @@ jobs:
       - name: Download test set answers
         run: wget -q ${{ secrets.TEST_ANSWERS_URL }} -O fanoutqa-test-answers.json
 
+      - name: Cache FOQA Cache
+        uses: actions/cache@v4
+        with:
+          path: "~/.cache/fanoutqa"
+          key: foqa-cache
+
       # set up in local workdir and hydrate results
       - name: Set up Python 3.10
         uses: actions/setup-python@v5
@@ -66,6 +74,12 @@ jobs:
           FANOUTQA_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
         run: python leaderboard-submissions/hydrate.py
 
+      - name: Save FOQA Cache
+        uses: actions/cache/save@v4
+        with:
+          path: "~/.cache/fanoutqa"
+          key: foqa-cache
+
       # ensure the PR comment exists so we can edit it in future steps
       - name: Ensure PR comment
         if: steps.eval.outputs.changed > 0 || failure()

diff --git a/leaderboard-submissions/hydrate.py b/leaderboard-submissions/hydrate.py
@@ -162,17 +162,17 @@ async def eval_submission(metadata_fp: Path, check_result: CheckResult):
 
     print("Evaluating closed book answers...")
     closedbook_answers = read_jsonl_answers(CB_PATH / check_result.metadata.closedbook_generations)
-    closedbook_scorer = Scorer(questions, closedbook_answers)
+    closedbook_scorer = Scorer(questions, closedbook_answers, llm_cache_key="eval")
     closedbook_results = (await closedbook_scorer.score()).to_dict()
 
     print("Evaluating open book answers...")
     openbook_answers = read_jsonl_answers(OB_PATH / check_result.metadata.openbook_generations)
-    openbook_scorer = Scorer(questions, openbook_answers)
+    openbook_scorer = Scorer(questions, openbook_answers, llm_cache_key="eval")
     openbook_results = (await openbook_scorer.score()).to_dict()
 
     print("Evaluating evidence provided answers...")
     evidenceprovided_answers = read_jsonl_answers(EP_PATH / check_result.metadata.evidenceprovided_generations)
-    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers)
+    evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers, llm_cache_key="eval")
     evidenceprovided_results = (await evidenceprovided_scorer.score()).to_dict()
 
     # hash the results to prevent score manipulation

diff --git a/leaderboard-submissions/metadata/llama33_70b_distill_ds_32k.json b/leaderboard-submissions/metadata/llama33_70b_distill_ds_32k.json
@@ -2,11 +2,11 @@
     "name": "Llama3.3 70b distilled DS - 32k ss",
     "authors": "SambaNova Systems",
     "url": "https://cloud.sambanova.ai/",
-    "citation": "SambaNova Systems",
+    "citation": "SambaNova Systems, 2025",
     "type": "PROMPT",
-    "context": "32,000",
+    "context": 32000,
     "is_trained_for_function_calling": false,
-    "details": "Additional model details (e.g. API model revision or Hugging Face model ID) - optional",
+    "details": null,
     "closedbook_generations": "llama33_70b_distill_ds_32k.jsonl",
     "openbook_generations": "llama33_70b_distill_ds_32k.jsonl",
     "evidenceprovided_generations": "llama33_70b_distill_ds_32k.jsonl"

diff --git a/leaderboard-submissions/results/llama33_70b_distill_ds_32k.json b/leaderboard-submissions/results/llama33_70b_distill_ds_32k.json
@@ -1,15 +1,15 @@
 {
-  "_submission_hash": "af5811b3e9bf9e0f8e5967898ce8e61b48772c2c267d10f5f6a47aa3f6221a67",
-  "_results_hash": "9fec365a2e7272ddaeaa0c93d9a792b07bb371ab61ce58f8a183f701d2118905",
+  "_submission_hash": "0115054d62b42cb80e3967c115a071a5af5e6879d8b2b1fb3b65a3bd85bf93a8",
+  "_results_hash": "adc70848c8a7f98337f5a56a08e51af318f9dcb41c876da80bbbb2fe892f70bd",
   "metadata": {
     "name": "Llama3.3 70b distilled DS - 32k ss",
     "authors": "SambaNova Systems",
     "url": null,
-    "citation": "",
+    "citation": "SambaNova Systems, 2025",
     "type": "PROMPT",
-    "context": "32,000",
+    "context": 32000,
     "is_trained_for_function_calling": false,
-    "details": "Additional model details (e.g. API model revision or Hugging Face model ID) - optional"
+    "details": null
   },
   "closedbook": {
     "acc": {
@@ -34,7 +34,7 @@
       }
     },
     "bleurt": 0.4949774522107938,
-    "gpt": 0.19337016574585636
+    "gpt": 0.19475138121546962
   },
   "openbook": {
     "acc": {
@@ -59,7 +59,7 @@
       }
     },
     "bleurt": 0.5296848898660906,
-    "gpt": 0.27624309392265195
+    "gpt": 0.26933701657458564
   },
   "evidenceprovided": {
     "acc": {
@@ -84,6 +84,6 @@
       }
     },
     "bleurt": 0.5824599785431808,
-    "gpt": 0.4129834254143646
+    "gpt": 0.4116022099447514
   }
 }