diff --git a/evals/registry/data/ordering_randomised_versionlist/samples.jsonl b/evals/registry/data/ordering_randomised_versionlist/samples.jsonl
new file mode 100644
index 0000000000..6bfdc2da8c
--- /dev/null
+++ b/evals/registry/data/ordering_randomised_versionlist/samples.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:123164695072b66e3128fcd05d0232c4c4103fb6a2a8732200415a62577eba69
+size 8948
diff --git a/evals/registry/evals/ordering_randomised_versionlist.yaml b/evals/registry/evals/ordering_randomised_versionlist.yaml
new file mode 100644
index 0000000000..a7a9bcacf1
--- /dev/null
+++ b/evals/registry/evals/ordering_randomised_versionlist.yaml
@@ -0,0 +1,10 @@
+ordering_randomised_versionlist:
+  id: ordering_randomised_versionlist.dev.v0
+  description: This evaluation aims to test prompt engineered failure cases to order a randomised version history list, but causes chronological ordering failures such as 7.5.2 -> 7.4.2 -> 7.5.1 -> 7.4.1 (incorrectly inserted 7.4.2 in between 7.5.2 and 7.5.1 in the Explainable AI chain of thoughts) and 7.5.2 -> 7.5.1 -> 7.5.0 -> 7.4.1 (incorrectly skipped over 7.4.2 in the Explainable AI chain of thoughts).
+  metrics: [accuracy]
+
+ordering_randomised_versionlist.dev.v0:
+  class: evals.elsuite.basic.includes:Includes
+  args:
+    samples_jsonl: ordering_randomised_versionlist/samples.jsonl
+