Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files: browse the repository at this point in the history
  • Loading branch information
snova-nidhih authored Mar 4, 2025
2 parents ce2fcbc + db950db commit 7c62f0f
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 14 deletions.
14 changes: 14 additions & 0 deletions .github/workflows/leaderboard-submission.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ permissions:
jobs:
evaluate:
runs-on: ubuntu-latest
environment:
name: leaderboard-eval-run
env:
GH_TOKEN: ${{ github.token }}

Expand Down Expand Up @@ -44,6 +46,12 @@ jobs:
- name: Download test set answers
run: wget -q ${{ secrets.TEST_ANSWERS_URL }} -O fanoutqa-test-answers.json

- name: Cache FOQA Cache
uses: actions/cache@v4
with:
path: "~/.cache/fanoutqa"
key: foqa-cache

# set up in local workdir and hydrate results
- name: Set up Python 3.10
uses: actions/setup-python@v5
Expand All @@ -66,6 +74,12 @@ jobs:
FANOUTQA_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: python leaderboard-submissions/hydrate.py

- name: Save FOQA Cache
uses: actions/cache/save@v4
with:
path: "~/.cache/fanoutqa"
key: foqa-cache

# ensure the PR comment exists so we can edit it in future steps
- name: Ensure PR comment
if: steps.eval.outputs.changed > 0 || failure()
Expand Down
6 changes: 3 additions & 3 deletions leaderboard-submissions/hydrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,17 +162,17 @@ async def eval_submission(metadata_fp: Path, check_result: CheckResult):

print("Evaluating closed book answers...")
closedbook_answers = read_jsonl_answers(CB_PATH / check_result.metadata.closedbook_generations)
closedbook_scorer = Scorer(questions, closedbook_answers)
closedbook_scorer = Scorer(questions, closedbook_answers, llm_cache_key="eval")
closedbook_results = (await closedbook_scorer.score()).to_dict()

print("Evaluating open book answers...")
openbook_answers = read_jsonl_answers(OB_PATH / check_result.metadata.openbook_generations)
openbook_scorer = Scorer(questions, openbook_answers)
openbook_scorer = Scorer(questions, openbook_answers, llm_cache_key="eval")
openbook_results = (await openbook_scorer.score()).to_dict()

print("Evaluating evidence provided answers...")
evidenceprovided_answers = read_jsonl_answers(EP_PATH / check_result.metadata.evidenceprovided_generations)
evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers)
evidenceprovided_scorer = Scorer(questions, evidenceprovided_answers, llm_cache_key="eval")
evidenceprovided_results = (await evidenceprovided_scorer.score()).to_dict()

# hash the results to prevent score manipulation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
"name": "Llama3.3 70b distilled DS - 32k ss",
"authors": "SambaNova Systems",
"url": "https://cloud.sambanova.ai/",
"citation": "SambaNova Systems",
"citation": "SambaNova Systems, 2025",
"type": "PROMPT",
"context": "32,000",
"context": 32000,
"is_trained_for_function_calling": false,
"details": "Additional model details (e.g. API model revision or Hugging Face model ID) - optional",
"details": null,
"closedbook_generations": "llama33_70b_distill_ds_32k.jsonl",
"openbook_generations": "llama33_70b_distill_ds_32k.jsonl",
"evidenceprovided_generations": "llama33_70b_distill_ds_32k.jsonl"
Expand Down
16 changes: 8 additions & 8 deletions leaderboard-submissions/results/llama33_70b_distill_ds_32k.json
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
{
"_submission_hash": "af5811b3e9bf9e0f8e5967898ce8e61b48772c2c267d10f5f6a47aa3f6221a67",
"_results_hash": "9fec365a2e7272ddaeaa0c93d9a792b07bb371ab61ce58f8a183f701d2118905",
"_submission_hash": "0115054d62b42cb80e3967c115a071a5af5e6879d8b2b1fb3b65a3bd85bf93a8",
"_results_hash": "adc70848c8a7f98337f5a56a08e51af318f9dcb41c876da80bbbb2fe892f70bd",
"metadata": {
"name": "Llama3.3 70b distilled DS - 32k ss",
"authors": "SambaNova Systems",
"url": null,
"citation": "",
"citation": "SambaNova Systems, 2025",
"type": "PROMPT",
"context": "32,000",
"context": 32000,
"is_trained_for_function_calling": false,
"details": "Additional model details (e.g. API model revision or Hugging Face model ID) - optional"
"details": null
},
"closedbook": {
"acc": {
Expand All @@ -34,7 +34,7 @@
}
},
"bleurt": 0.4949774522107938,
"gpt": 0.19337016574585636
"gpt": 0.19475138121546962
},
"openbook": {
"acc": {
Expand All @@ -59,7 +59,7 @@
}
},
"bleurt": 0.5296848898660906,
"gpt": 0.27624309392265195
"gpt": 0.26933701657458564
},
"evidenceprovided": {
"acc": {
Expand All @@ -84,6 +84,6 @@
}
},
"bleurt": 0.5824599785431808,
"gpt": 0.4129834254143646
"gpt": 0.4116022099447514
}
}

0 comments on commit 7c62f0f

Please sign in to comment.