Test output table layout consistency #1916

Merged
4 changes: 3 additions & 1 deletion lm_eval/evaluator_utils.py
@@ -223,7 +223,7 @@ def prepare_print_tasks(

def consolidate_results(
eval_tasks: List[TaskOutput],
-) -> Tuple[dict, dict, dict, dict, dict]:
+) -> Tuple[dict, dict, dict, dict, dict, dict]:
"""
@param eval_tasks: list(TaskOutput).
@return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.
@@ -240,6 +240,8 @@ def consolidate_results(
- configs: A defaultdict with task names as keys and task configurations as values.
- versions: A defaultdict with task names as keys and task versions as values.
- num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
+ - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
+   for each metric as values.

The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
"""
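The new sixth element of the tuple feeds the ↑/↓ direction column in the printed tables. As a minimal sketch (not the PR's exact implementation) of the shape this mapping takes, using task and metric names from the golden files further down:

```python
from collections import defaultdict

# Sketch: per-task mapping of metric name -> whether higher is better.
higher_is_better = defaultdict(dict)
higher_is_better["ai2_arc"] = {"acc": True, "acc_norm": True}
higher_is_better["wikitext"] = {"word_perplexity": False, "bits_per_byte": False}

# A table renderer can turn the flag into the arrow column:
def arrow(flag):
    return {True: "↑", False: "↓"}.get(flag, " ")

print(arrow(higher_is_better["ai2_arc"]["acc"]))             # ↑
print(arrow(higher_is_better["wikitext"]["bits_per_byte"]))  # ↓
```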
6 changes: 5 additions & 1 deletion lm_eval/utils.py
@@ -300,7 +300,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
if "alias" in dic:
k = dic.pop("alias")

for (mf), v in dic.items():
metric_items = dic.items()
if sort_results:
metric_items = sorted(metric_items)

for (mf), v in metric_items:
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
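The effect of the new sort_results path is easiest to see in isolation. A small standalone sketch with an invented results dict; sorting makes the printed row order independent of dict insertion order:

```python
# Metric keys are "metric,filter" strings, e.g. "acc,none".
dic = {"acc_norm,none": 0.05, "acc,none": 0.15, "acc_stderr,none": 0.01}

for mf, v in sorted(dic.items()):
    m, _, f = mf.partition(",")
    if m.endswith("_stderr"):
        continue  # stderr is shown next to its metric, not as its own row
    print(m, f, v)
# acc none 0.15
# acc_norm none 0.05
```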
72 changes: 72 additions & 0 deletions tests/test_evaluator.py
@@ -1,11 +1,13 @@
import os
+import re
from typing import List

import pytest

import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
+from lm_eval.utils import make_table


os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -75,3 +77,73 @@ def r(x):
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )


@pytest.mark.parametrize(
    "task_name,limit,model,model_args",
    [
        (
            ["ai2_arc"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["lambada_openai"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
        (
            ["wikitext"],
            10,
            "hf",
            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
        ),
    ],
)
def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str):
    results = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
        bootstrap_iters=0,
        random_seed=0,
        numpy_random_seed=0,
        torch_random_seed=0,
        fewshot_random_seed=0,
    )

    filename = "_".join(
        (
            "-".join(task_name),
            str(limit),
            str(model),
            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
        )
    )
    filepath = f"./tests/testdata/{filename}.txt"
    with open(filepath, "r") as f:
        t1 = f.read().strip()

    t2 = make_table(results).strip()

    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
    assert len(t1_lines) == len(t2_lines)
    for t1_line, t2_line in zip(t1_lines, t2_lines):
        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
        assert len(t1_items) == len(t2_items)
        for t1_item, t2_item in zip(t1_items, t2_items):
            try:
                t1_item = float(t1_item)
                t2_item = float(t2_item)
                assert abs(t1_item - t2_item) < 0.1
            except ValueError:
                assert t1_item == t2_item
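For reference, the golden-file name is derived mechanically from the test parameters; working it out for the ai2_arc case (same construction as in the test above):

```python
import re

task_name = ["ai2_arc"]
limit, model = 10, "hf"
model_args = "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu"

# Characters outside [a-zA-Z0-9_\-.] (here "=", "/", ",") become "-".
filename = "_".join(
    ("-".join(task_name), str(limit), str(model),
     re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args))
)
print(filename)
# ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu
```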
8 changes: 8 additions & 0 deletions tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
@@ -0,0 +1,8 @@
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
4 changes: 4 additions & 0 deletions tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
@@ -0,0 +1,4 @@
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
5 changes: 5 additions & 0 deletions tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
@@ -0,0 +1,5 @@
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
5 changes: 5 additions & 0 deletions tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
@@ -0,0 +1,5 @@
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
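A note on reading these golden files: Stderr is N/A because the test runs with bootstrap_iters=0, and the ↑/↓ column marks whether higher (accuracy-style) or lower (perplexity-style) values are better for each metric. When test_printed_results compares a regenerated table against these files, numeric cells only need to agree within 0.1; a tiny illustration (the regenerated value is hypothetical):

```python
# Numeric cells may drift slightly between runs; the test accepts any
# absolute difference below 0.1 (regenerated value is hypothetical).
golden, regenerated = "130.4812", "130.4655"
assert abs(float(golden) - float(regenerated)) < 0.1
# Non-numeric cells (task names, filters, arrows, "N/A") must match exactly.
```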