adding Benchmark and Eval classes (#25)
* adding Benchmark and Eval classes

* simplifying benchmarking

* updating snapshot
chanind authored Dec 4, 2023
1 parent bd01efb commit e8dd92b
Showing 9 changed files with 168 additions and 12 deletions.
48 changes: 48 additions & 0 deletions repepo/core/benchmark.py
@@ -0,0 +1,48 @@
# pyright: strict, reportMissingTypeStubs=false

from dataclasses import dataclass
from typing import Optional, Sequence

from transformers import GenerationConfig

from repepo.algorithms.base import BaseAlgorithm
from repepo.core.evaluate import EvalPrediction, EvalResult, Evaluator
from repepo.core.format import AbstractFormatter, InputOutputFormatter
from repepo.core.pipeline import Pipeline

from repepo.core.types import Dataset, Model, Tokenizer


@dataclass
class Benchmark:
name: str
train_dataset: Dataset
test_dataset: Dataset
evaluators: list[Evaluator]


def train_and_evaluate_benchmark(
model: Model,
tokenizer: Tokenizer,
algorithms: Sequence[BaseAlgorithm],
benchmark: Benchmark,
formatter: Optional[AbstractFormatter] = None,
generation_config: Optional[GenerationConfig] = None,
) -> EvalResult:
# set up pipeline
pipeline = Pipeline(model, tokenizer, formatter=formatter or InputOutputFormatter())

# train pipeline
for algorithm in algorithms:
pipeline = algorithm.run(pipeline, benchmark.train_dataset)

# evaluate
predictions: list[EvalPrediction] = []
# TODO: support batching
for example in benchmark.test_dataset:
output = pipeline.generate(example, generation_config=generation_config)
predictions.append(EvalPrediction(example, output))
metrics: dict[str, float] = {}
for evaluator in benchmark.evaluators:
metrics.update(evaluator(predictions))
return EvalResult(predictions, metrics)
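
For orientation, here is a minimal usage sketch of the new benchmark API. This is not part of the commit; the model choice and dataset contents are illustrative and mirror the test added below.

from transformers import AutoTokenizer, GenerationConfig, GPTNeoXForCausalLM

from repepo.algorithms.icl import InContextLearning
from repepo.core.benchmark import Benchmark, train_and_evaluate_benchmark
from repepo.core.evaluate import AccuracyEvaluator
from repepo.core.types import Example

# Illustrative setup: a small Pythia model, as in the test fixtures.
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-160m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")

benchmark = Benchmark(
    name="capitals",
    train_dataset=[Example("", "Paris is located in the country of", "France")],
    test_dataset=[Example("", "Kyoto is located in the country of", "Japan")],
    evaluators=[AccuracyEvaluator()],
)

# Algorithms are applied to the pipeline on the train split, then the test split
# is generated and scored by each evaluator.
result = train_and_evaluate_benchmark(
    model=model,
    tokenizer=tokenizer,
    algorithms=[InContextLearning()],
    benchmark=benchmark,
    generation_config=GenerationConfig(
        max_length=100, pad_token_id=tokenizer.eos_token_id
    ),
)
print(result.metrics)  # e.g. {"accuracy": 1.0}
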
36 changes: 36 additions & 0 deletions repepo/core/evaluate.py
@@ -0,0 +1,36 @@
# pyright: strict


from dataclasses import dataclass
from statistics import mean
from typing import Callable, Sequence
from repepo.core.types import Example


@dataclass
class EvalPrediction:
example: Example
output: str


@dataclass
class EvalResult:
predictions: list[EvalPrediction]
metrics: dict[str, float]


Evaluator = Callable[[Sequence[EvalPrediction]], dict[str, float]]


class AccuracyEvaluator:
def score_prediction(self, prediction: EvalPrediction) -> float:
"""Score a single prediction, 1 if correct, 0 otherwise."""
expected = prediction.example.output
# the output might be longer than the expected depending on how many tokens we generate
# so just verify that the expected output is a prefix of the generated output
is_correct = prediction.output.strip().startswith(expected.strip())
return 1.0 if is_correct else 0.0

def __call__(self, predictions: Sequence[EvalPrediction]) -> dict[str, float]:
pred_results = [self.score_prediction(pred) for pred in predictions]
return {"accuracy": mean(pred_results)}
4 changes: 2 additions & 2 deletions repepo/core/pipeline.py
@@ -26,7 +26,7 @@ def build_generation_prompt(self, example: Example) -> str:
def generate(
self,
example: Example,
config: Optional[GenerationConfig] = None,
generation_config: Optional[GenerationConfig] = None,
remove_base_prompt: bool = True,
) -> str:
"""Generate a completion for a given example"""
@@ -35,7 +35,7 @@ def generate(
inputs = inputs.to(self.model.device)
outputs = self.model.generate(
**inputs,
config=config,
generation_config=generation_config,
)[0]
outputs_str = self.tokenizer.decode(outputs, skip_special_tokens=True)
if remove_base_prompt:
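
With this rename, callers of Pipeline.generate pass generation_config rather than config, matching the keyword forwarded to the underlying model.generate call. An illustrative, self-contained call (mirroring the updated test further down):

from transformers import AutoTokenizer, GenerationConfig, GPTNeoXForCausalLM

from repepo.core.pipeline import Pipeline
from repepo.core.types import Example

model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
pipeline = Pipeline(model, tokenizer)
example = Example(instruction="Respond", input="A B C D", output="E")

# before this commit: pipeline.generate(example, config=GenerationConfig(max_new_tokens=10))
output = pipeline.generate(example, generation_config=GenerationConfig(max_new_tokens=10))
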
2 changes: 1 addition & 1 deletion repepo/core/prompt.py
@@ -6,7 +6,7 @@


def completion_to_str(completion: Completion) -> str:
return completion.prompt + "\n" + completion.response
return completion.prompt.rstrip() + " " + completion.response.lstrip() + "\n"


class AbstractPrompter(abc.ABC):
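
The change to completion_to_str joins prompt and response with a single space (trimming the adjacent whitespace) and appends a trailing newline, instead of joining with a newline. An illustrative before/after, assuming Completion is the prompt/response pair type from repepo.core.types:

from repepo.core.prompt import completion_to_str
from repepo.core.types import Completion

completion = Completion(prompt="Input: Paris is in\nOutput:", response="France")
print(repr(completion_to_str(completion)))
# old behaviour: 'Input: Paris is in\nOutput:\nFrance'
# new behaviour: 'Input: Paris is in\nOutput: France\n'
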
27 changes: 26 additions & 1 deletion tests/conftest.py
@@ -8,6 +8,7 @@


_model: GPTNeoXForCausalLM | None = None
_larger_model: GPTNeoXForCausalLM | None = None


def _load_model() -> GPTNeoXForCausalLM:
@@ -16,7 +17,6 @@ def _load_model() -> GPTNeoXForCausalLM:
if _model is None:
_pretrained_model = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-70m",
# torch_dtype=torch.bfloat16,
torch_dtype=torch.float64,
token=True,
)
@@ -26,7 +26,22 @@
return _model


def _load_larger_model() -> GPTNeoXForCausalLM:
global _larger_model

if _larger_model is None:
_pretrained_model = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-160m",
token=True,
)
assert type(_pretrained_model) == GPTNeoXForCausalLM
device: Any = "cuda" if torch.cuda.is_available() else "cpu"
_larger_model = _pretrained_model.to(device).eval()
return _larger_model


_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
_larger_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
# _model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-6.9b", torch_dtype=torch.bfloat16, device_map="auto", token=True, cache_dir = "/ext_usb").eval()
# _tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-6.9b", cache_dir = "/ext_usb")

@@ -39,3 +54,13 @@ def model() -> GPTNeoXForCausalLM:
@pytest.fixture
def tokenizer() -> Tokenizer:
return _tokenizer


@pytest.fixture
def larger_model() -> GPTNeoXForCausalLM:
return _load_larger_model()


@pytest.fixture
def larger_tokenizer() -> Tokenizer:
return _larger_tokenizer
12 changes: 6 additions & 6 deletions tests/core/__snapshots__/test_pipeline.ambr
@@ -2,14 +2,14 @@
# name: test_icl_Pipeline_build_generation_prompt
'''
Input: Paris is in
Output:
France
Output: France

Input: London is in
Output:
England
Output: England

Input: Berlin is in
Output:
Germany
Output: Germany

Input: Beijing is in
Output:
'''
44 changes: 44 additions & 0 deletions tests/core/test_benchmark.py
@@ -0,0 +1,44 @@
import pytest
from transformers import GenerationConfig

from repepo.algorithms.icl import InContextLearning
from repepo.core.benchmark import Benchmark, train_and_evaluate_benchmark
from repepo.core.evaluate import AccuracyEvaluator
from repepo.core.types import Dataset, Example, Tokenizer, Model


def test_evaluate_benchmark(larger_model: Model, larger_tokenizer: Tokenizer) -> None:
train_dataset: Dataset = [
Example("", "Paris is located in the country of", "France"),
Example("", "Shanghai is located in the country of", "China"),
Example("", "Tokyo is located in the country of", "Japan"),
Example("", "London is located in the country of", "England"),
]
test_dataset = [
Example("", "Kyoto is located in the country of", "Japan"),
Example("", "Beijing is located in the country of", "China"),
Example("", "FakePlace is located in the country of", "WrongAnswer"),
]
benchmark = Benchmark(
name="test benchmark",
train_dataset=train_dataset,
test_dataset=test_dataset,
evaluators=[AccuracyEvaluator()],
)
algorithms = [InContextLearning()]

results = train_and_evaluate_benchmark(
model=larger_model,
tokenizer=larger_tokenizer,
algorithms=algorithms,
benchmark=benchmark,
generation_config=GenerationConfig(
max_length=100, pad_token_id=larger_tokenizer.eos_token_id
),
)

assert len(results.predictions) == 3
assert results.predictions[0].example == test_dataset[0]
assert results.predictions[1].example == test_dataset[1]
assert results.predictions[2].example == test_dataset[2]
assert results.metrics["accuracy"] == pytest.approx(2 / 3)
3 changes: 2 additions & 1 deletion tests/core/test_pipeline.py
@@ -13,7 +13,8 @@ def test_basic_Pipeline_generate(
) -> None:
pipeline = Pipeline(model, tokenizer)
res = pipeline.generate(
Example(instruction="Respond", input="A B C D", output="E"), config=None
Example(instruction="Respond", input="A B C D", output="E"),
generation_config=None,
)
# pythia-70m generates nonsense, so just verify we get something
assert isinstance(res, str)
4 changes: 3 additions & 1 deletion tests/core/test_prompter.py
@@ -15,4 +15,6 @@ def test_few_shot_prompter():
examples = [Completion(prompt="Example 1", response="Response 1")]
prompter = FewShotPrompter(examples)
output = prompter.apply(completion)
assert output.prompt == examples[0].prompt + "\n" + examples[0].response + "\nHello"
assert (
output.prompt == examples[0].prompt + " " + examples[0].response + "\n\nHello"
)
