adding Benchmark and Eval classes (#25)
* adding Benchmark and Eval classes

* simplifying benchmarking

* updating snapshot
chanind authored Dec 4, 2023
1 parent bd01efb commit e8dd92b
Showing 9 changed files with 168 additions and 12 deletions.
48 changes: 48 additions & 0 deletions repepo/core/benchmark.py
@@ -0,0 +1,48 @@
# pyright: strict, reportMissingTypeStubs=false

from dataclasses import dataclass
from typing import Optional, Sequence

from transformers import GenerationConfig

from repepo.algorithms.base import BaseAlgorithm
from repepo.core.evaluate import EvalPrediction, EvalResult, Evaluator
from repepo.core.format import AbstractFormatter, InputOutputFormatter
from repepo.core.pipeline import Pipeline

from repepo.core.types import Dataset, Model, Tokenizer


@dataclass
class Benchmark:
name: str
train_dataset: Dataset
test_dataset: Dataset
evaluators: list[Evaluator]


def train_and_evaluate_benchmark(
model: Model,
tokenizer: Tokenizer,
algorithms: Sequence[BaseAlgorithm],
benchmark: Benchmark,
formatter: Optional[AbstractFormatter] = None,
generation_config: Optional[GenerationConfig] = None,
) -> EvalResult:
# set up pipeline
pipeline = Pipeline(model, tokenizer, formatter=formatter or InputOutputFormatter())

# train pipeline
for algorithm in algorithms:
pipeline = algorithm.run(pipeline, benchmark.train_dataset)

# evaluate
predictions: list[EvalPrediction] = []
# TODO: support batching
for example in benchmark.test_dataset:
output = pipeline.generate(example, generation_config=generation_config)
predictions.append(EvalPrediction(example, output))
metrics: dict[str, float] = {}
for evaluator in benchmark.evaluators:
metrics.update(evaluator(predictions))
return EvalResult(predictions, metrics)
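
For orientation, here is a minimal usage sketch of the new benchmark API. This is not part of the commit; the model choice and dataset contents are illustrative and mirror the test added below.

from transformers import AutoTokenizer, GenerationConfig, GPTNeoXForCausalLM

from repepo.algorithms.icl import InContextLearning
from repepo.core.benchmark import Benchmark, train_and_evaluate_benchmark
from repepo.core.evaluate import AccuracyEvaluator
from repepo.core.types import Example

# Illustrative setup: a small Pythia model, as in the test fixtures.
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-160m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")

benchmark = Benchmark(
    name="capitals",
    train_dataset=[Example("", "Paris is located in the country of", "France")],
    test_dataset=[Example("", "Kyoto is located in the country of", "Japan")],
    evaluators=[AccuracyEvaluator()],
)

# Algorithms are applied to the pipeline on the train split, then the test split
# is generated and scored by each evaluator.
result = train_and_evaluate_benchmark(
    model=model,
    tokenizer=tokenizer,
    algorithms=[InContextLearning()],
    benchmark=benchmark,
    generation_config=GenerationConfig(
        max_length=100, pad_token_id=tokenizer.eos_token_id
    ),
)
print(result.metrics)  # e.g. {"accuracy": 1.0}
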
36 changes: 36 additions & 0 deletions repepo/core/evaluate.py
@@ -0,0 +1,36 @@
# pyright: strict


from dataclasses import dataclass
from statistics import mean
from typing import Callable, Sequence
from repepo.core.types import Example


@dataclass
class EvalPrediction:
example: Example
output: str


@dataclass
class EvalResult:
predictions: list[EvalPrediction]
metrics: dict[str, float]


Evaluator = Callable[[Sequence[EvalPrediction]], dict[str, float]]


class AccuracyEvaluator:
def score_prediction(self, prediction: EvalPrediction) -> float:
"""Score a single prediction, 1 if correct, 0 otherwise."""
expected = prediction.example.output
# the output might be longer than the expected depending on how many tokens we generate
# so just verify that the expected output is a prefix of the generated output
is_correct = prediction.output.strip().startswith(expected.strip())
return 1.0 if is_correct else 0.0

def __call__(self, predictions: Sequence[EvalPrediction]) -> dict[str, float]:
pred_results = [self.score_prediction(pred) for pred in predictions]
return {"accuracy": mean(pred_results)}
4 changes: 2 additions & 2 deletions repepo/core/pipeline.py
@@ -26,7 +26,7 @@ def build_generation_prompt(self, example: Example) -> str:
def generate(
self,
example: Example,
config: Optional[GenerationConfig] = None,
generation_config: Optional[GenerationConfig] = None,
remove_base_prompt: bool = True,
) -> str:
"""Generate a completion for a given example"""
@@ -35,7 +35,7 @@ def generate(
inputs = inputs.to(self.model.device)
outputs = self.model.generate(
**inputs,
config=config,
generation_config=generation_config,
)[0]
outputs_str = self.tokenizer.decode(outputs, skip_special_tokens=True)
if remove_base_prompt:
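
With this rename, callers of Pipeline.generate pass generation_config rather than config, matching the keyword forwarded to the underlying model.generate call. An illustrative, self-contained call (mirroring the updated test further down):

from transformers import AutoTokenizer, GenerationConfig, GPTNeoXForCausalLM

from repepo.core.pipeline import Pipeline
from repepo.core.types import Example

model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
pipeline = Pipeline(model, tokenizer)
example = Example(instruction="Respond", input="A B C D", output="E")

# before this commit: pipeline.generate(example, config=GenerationConfig(max_new_tokens=10))
output = pipeline.generate(example, generation_config=GenerationConfig(max_new_tokens=10))
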
2 changes: 1 addition & 1 deletion repepo/core/prompt.py
@@ -6,7 +6,7 @@


def completion_to_str(completion: Completion) -> str:
return completion.prompt + "\n" + completion.response
return completion.prompt.rstrip() + " " + completion.response.lstrip() + "\n"


class AbstractPrompter(abc.ABC):
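
The change to completion_to_str joins prompt and response with a single space (trimming the adjacent whitespace) and appends a trailing newline, instead of joining with a newline. An illustrative before/after, assuming Completion is the prompt/response pair type from repepo.core.types:

from repepo.core.prompt import completion_to_str
from repepo.core.types import Completion

completion = Completion(prompt="Input: Paris is in\nOutput:", response="France")
print(repr(completion_to_str(completion)))
# old behaviour: 'Input: Paris is in\nOutput:\nFrance'
# new behaviour: 'Input: Paris is in\nOutput: France\n'
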
27 changes: 26 additions & 1 deletion tests/conftest.py
@@ -8,6 +8,7 @@


_model: GPTNeoXForCausalLM | None = None
_larger_model: GPTNeoXForCausalLM | None = None


def _load_model() -> GPTNeoXForCausalLM:
@@ -16,7 +17,6 @@ def _load_model() -> GPTNeoXForCausalLM:
if _model is None:
_pretrained_model = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-70m",
# torch_dtype=torch.bfloat16,
torch_dtype=torch.float64,
token=True,
)
@@ -26,7 +26,22 @@
return _model


def _load_larger_model() -> GPTNeoXForCausalLM:
global _larger_model

if _larger_model is None:
_pretrained_model = GPTNeoXForCausalLM.from_pretrained(
"EleutherAI/pythia-160m",
token=True,
)
assert type(_pretrained_model) == GPTNeoXForCausalLM
device: Any = "cuda" if torch.cuda.is_available() else "cpu"
_larger_model = _pretrained_model.to(device).eval()
return _larger_model


_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
_larger_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
# _model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-6.9b", torch_dtype=torch.bfloat16, device_map="auto", token=True, cache_dir = "/ext_usb").eval()
# _tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-6.9b", cache_dir = "/ext_usb")

@@ -39,3 +54,13 @@ def model() -> GPTNeoXForCausalLM:
@pytest.fixture
def tokenizer() -> Tokenizer:
return _tokenizer


@pytest.fixture
def larger_model() -> GPTNeoXForCausalLM:
return _load_larger_model()


@pytest.fixture
def larger_tokenizer() -> Tokenizer:
return _larger_tokenizer
12 changes: 6 additions & 6 deletions tests/core/__snapshots__/test_pipeline.ambr
@@ -2,14 +2,14 @@
# name: test_icl_Pipeline_build_generation_prompt
'''
Input: Paris is in
Output:
France
Output: France

Input: London is in
Output:
England
Output: England

Input: Berlin is in
Output:
Germany
Output: Germany

Input: Beijing is in
Output:
'''
44 changes: 44 additions & 0 deletions tests/core/test_benchmark.py
@@ -0,0 +1,44 @@
import pytest
from transformers import GenerationConfig

from repepo.algorithms.icl import InContextLearning
from repepo.core.benchmark import Benchmark, train_and_evaluate_benchmark
from repepo.core.evaluate import AccuracyEvaluator
from repepo.core.types import Dataset, Example, Tokenizer, Model


def test_evaluate_benchmark(larger_model: Model, larger_tokenizer: Tokenizer) -> None:
train_dataset: Dataset = [
Example("", "Paris is located in the country of", "France"),
Example("", "Shanghai is located in the country of", "China"),
Example("", "Tokyo is located in the country of", "Japan"),
Example("", "London is located in the country of", "England"),
]
test_dataset = [
Example("", "Kyoto is located in the country of", "Japan"),
Example("", "Beijing is located in the country of", "China"),
Example("", "FakePlace is located in the country of", "WrongAnswer"),
]
benchmark = Benchmark(
name="test benchmark",
train_dataset=train_dataset,
test_dataset=test_dataset,
evaluators=[AccuracyEvaluator()],
)
algorithms = [InContextLearning()]

results = train_and_evaluate_benchmark(
model=larger_model,
tokenizer=larger_tokenizer,
algorithms=algorithms,
benchmark=benchmark,
generation_config=GenerationConfig(
max_length=100, pad_token_id=larger_tokenizer.eos_token_id
),
)

assert len(results.predictions) == 3
assert results.predictions[0].example == test_dataset[0]
assert results.predictions[1].example == test_dataset[1]
assert results.predictions[2].example == test_dataset[2]
assert results.metrics["accuracy"] == pytest.approx(2 / 3)
3 changes: 2 additions & 1 deletion tests/core/test_pipeline.py
@@ -13,7 +13,8 @@ def test_basic_Pipeline_generate(
) -> None:
pipeline = Pipeline(model, tokenizer)
res = pipeline.generate(
Example(instruction="Respond", input="A B C D", output="E"), config=None
Example(instruction="Respond", input="A B C D", output="E"),
generation_config=None,
)
# pythia-70m generates nonsense, so just verify we get something
assert isinstance(res, str)
4 changes: 3 additions & 1 deletion tests/core/test_prompter.py
@@ -15,4 +15,6 @@ def test_few_shot_prompter():
examples = [Completion(prompt="Example 1", response="Response 1")]
prompter = FewShotPrompter(examples)
output = prompter.apply(completion)
assert output.prompt == examples[0].prompt + "\n" + examples[0].response + "\nHello"
assert (
output.prompt == examples[0].prompt + " " + examples[0].response + "\n\nHello"
)
