diff --git a/evals/registry/data/probability_questions/probability_questions.jsonl b/evals/registry/data/probability_questions/probability_questions.jsonl new file mode 100644 index 0000000000..67591e97ed --- /dev/null +++ b/evals/registry/data/probability_questions/probability_questions.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586447e2dd41421446f8dc163babe17d47af4344b1f607d117b62827453acee3 +size 54726 diff --git a/evals/registry/evals/probability_questions.yaml b/evals/registry/evals/probability_questions.yaml new file mode 100644 index 0000000000..32bcd574e1 --- /dev/null +++ b/evals/registry/evals/probability_questions.yaml @@ -0,0 +1,8 @@ +probability-questions: + id: probability-questions.dev.v0 + description: A collection of probability questions that ChatGPT fails. Let's see if GPT-4 can do better. + metrics: [accuracy] +probability-questions.dev.v0: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: probability_questions/probability_questions.jsonl \ No newline at end of file