diff --git a/.pylintrc b/.pylintrc
index 01a605d..14b3eb1 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
         pointless-statement,
         wrong-import-order,
         line-too-long,
-        dangerous-default-value
+        dangerous-default-value,
+        too-many-instance-attributes

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 5967218..3251d44 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -26,3 +26,5 @@ TODO
 tox
 venv
 vllm
+barebones
+LM
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a897297..1ba0513 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.4.2
+
+* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise it passes the model a barebones prompt.
+* Adds an `extra_args` parameter to the `.run` method of all MMLU-based evaluators. This lets consumers pass any additional arguments they need directly through to the `lm_eval.evaluator.simple_evaluate` function.
+
 ## 0.4

 * Added ability to specify a custom http client to MT-Bench
diff --git a/scripts/test_mmlu.py b/scripts/test_mmlu.py
index 2db46c0..a6035df 100755
--- a/scripts/test_mmlu.py
+++ b/scripts/test_mmlu.py
@@ -1,16 +1,73 @@
+# Standard
+from typing import Dict, List, Tuple, TypedDict
+
 # First Party
 from instructlab.eval.mmlu import MMLUEvaluator

+SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
+
+
+class MMLUSample(TypedDict):
+    """
+    Example of a single sample returned from lm_eval when running MMLU.
+    This is not a comprehensive type, just the subset of fields we care about for this test.
+    """
+
+    # Arguments is the list of (prompt, answer) pairs passed to MMLU as few-shot samples.
+    # They will not be present with few_shot=0
+    arguments: List[Tuple[str, str]]
+
+
+def all_samples_contain_system_prompt(
+    samples: Dict[str, List[MMLUSample]], prompt: str
+) -> bool:
+    """
+    Given a mapping of evaluation --> list of results, validates that all few-shot examples
+    included the system prompt
+    """
+    for topic, samples_set in samples.items():
+        for sample in samples_set:
+            for mmlu_prompt, _ in sample["arguments"]:
+                if prompt not in mmlu_prompt:
+                    # we are looking for the exact system prompt, so no need to normalize to lowercase
+                    print(f"found a sample in the '{topic}' MMLU topic set that was missing the system prompt")
+                    return False
+
+    return True
+

 def test_minimal_mmlu():
     print("===> Executing 'test_minimal_mmlu'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         tasks = ["mmlu_anatomy", "mmlu_astronomy"]
-        mmlu = MMLUEvaluator(model_path=model_path, tasks=tasks)
-        overall_score, individual_scores = mmlu.run()
+        mmlu = MMLUEvaluator(
+            model_path=model_path,
+            tasks=tasks,
+            system_prompt=SYSTEM_PROMPT,
+        )
+        overall_score, individual_scores = mmlu.run(
+            extra_args={"log_samples": True, "write_out": True}
+        )
+        samples = mmlu.results["samples"]
+
         print(overall_score)
         print(individual_scores)
+
+        # we need n-shots > 1 to be able to validate the inclusion of the system prompt
+        eligible_samples = {
+            topic: samples[topic]
+            for topic, shot in mmlu.results["n-shot"].items()
+            if shot > 1
+        }
+        if eligible_samples:
+            if not all_samples_contain_system_prompt(eligible_samples, SYSTEM_PROMPT):
+                return False
+        else:
+            print(
+                "MMLU was run in zero-shot mode, cannot confirm that system prompt was included, skipping check..."
+            )
+
     except Exception as exc:
         print(f"'test_minimal_mmlu' failed: {exc}")
         return False
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
index f893b66..1aaf462 100644
--- a/src/instructlab/eval/mmlu.py
+++ b/src/instructlab/eval/mmlu.py
@@ -7,12 +7,12 @@
 """

 # Standard
-from typing import Optional, Union
+from typing import Any, Dict, Optional, Union
 import os

 # Third Party
-from lm_eval.evaluator import simple_evaluate  # type: ignore
-from lm_eval.tasks import TaskManager  # type: ignore
+from lm_eval.evaluator import simple_evaluate
+from lm_eval.tasks import TaskManager
 import torch

 # First Party
@@ -102,6 +102,8 @@ class AbstractMMLUEvaluator(Evaluator):
         few_shots       number of examples
         batch_size      batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
         device          PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt   system prompt to be used when applying the chat template
+        results         full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run.
     """

     def __init__(
@@ -113,26 +115,43 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
+        self.system_prompt = system_prompt
         self.tasks_dir = tasks_dir
         self.tasks = tasks
         self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
         self.device = device
+        self._results = None

-    def run(self, server_url: str | None = None) -> tuple:
+    @property
+    def results(self) -> Dict[str, Any] | None:
+        """
+        Returns the results of the last MMLU evaluation, if one has taken place.
+
+        Returns:
+            Dict[str, Any] | None: The output from `lm_eval.evaluator.simple_evaluate`
+        """
+        return self._results
+
+    def run(
+        self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
+    ) -> tuple:
         """
         Runs evaluation

         Attributes
             server_url  Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated
+            extra_args  Dictionary containing any extra arguments to be passed to the `lm_eval.evaluator.simple_evaluate` function.

         Returns:
             overall_score       Average score for the task group
             individual_scores   Individual scores for each task in the task group
         """
+        extra_args = {} if not extra_args else extra_args
         logger.debug(locals())

         # TODO: make this a parameter for class?
@@ -153,7 +172,10 @@ def run(self, server_url: str | None = None) -> tuple:

         return overall_score, individual_scores

-    def _run_mmlu(self, server_url: str | None = None) -> dict:
+    def _run_mmlu(
+        self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None
+    ) -> dict:
+        extra_args = {} if not extra_args else extra_args
         if server_url is not None:
             # Requires lm_eval >= 0.4.4
             model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface"
@@ -168,17 +190,25 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
             if not os.access(self.tasks_dir, os.R_OK):
                 raise InvalidTasksDirError(self.tasks_dir)
             tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
-        mmlu_output = self._simple_evaluate_with_error_handling(
-            model=model,
-            model_args=model_args,
-            tasks=self.tasks,
-            num_fewshot=self.few_shots,
-            batch_size=self.batch_size,
-            device=self.device,
-            task_manager=tm,
-        )
-        results = mmlu_output["results"]
-        return results
+        should_apply_chat_template = self.system_prompt is not None
+
+        # configure the args here so users can override them as necessary
+        simple_evaluate_kwargs = {
+            "model": model,
+            "model_args": model_args,
+            "tasks": self.tasks,
+            "num_fewshot": self.few_shots,
+            "batch_size": self.batch_size,
+            "device": self.device,
+            "task_manager": tm,
+            "system_instruction": self.system_prompt,
+            "apply_chat_template": should_apply_chat_template,
+        }
+        simple_evaluate_kwargs.update(extra_args)
+
+        results = self._simple_evaluate_with_error_handling(**simple_evaluate_kwargs)
+        self._results = results
+        return results["results"]

     # This method converts general errors from simple_evaluate
     # into a more user-understandable error
@@ -213,12 +243,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
     Evaluator for Massive Multitask Language Understanding (MMLU)

     Attributes:
-        model_path   absolute path to or name of a huggingface model
-        tasks        list of tasks for MMLU to test the model with
-        model_dtype  dtype of model when served
-        few_shots    number of examples
-        batch_size   batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
-        device       PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        model_path     absolute path to or name of a huggingface model
+        tasks          list of tasks for MMLU to test the model with
+        model_dtype    dtype of model when served
+        few_shots      number of examples
+        batch_size     batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
+        device         PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt  system prompt to be used when applying the chat template
     """

     name = "mmlu"
@@ -231,9 +262,17 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         super().__init__(
-            model_path, None, tasks, model_dtype, few_shots, batch_size, device
+            model_path,
+            None,
+            tasks,
+            model_dtype,
+            few_shots,
+            batch_size,
+            device,
+            system_prompt=system_prompt,
         )


@@ -243,6 +282,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):

     Attributes:
         model_path      absolute path to or name of a huggingface model
+        system_prompt   system prompt to be used when applying the chat template
         tasks_dir       path where the .jsonl and _task.yaml files for the branches being evaluated are stored
         tasks           group name that is shared by all the MMLUBranch tasks
         model_dtype     dtype of model when served
diff --git a/tests/test_mmlu.py b/tests/test_mmlu.py
index bdf4f90..2cc0c79 100644
--- a/tests/test_mmlu.py
+++ b/tests/test_mmlu.py
@@ -48,7 +48,10 @@ def test_mmlu_branch(eval_mock):
     tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg"
     tasks = ["mmlu_pr"]
     mmlu = MMLUBranchEvaluator(
-        model_path=MODEL_EXAMPLE, tasks_dir=tasks_dir, tasks=tasks
+        model_path=MODEL_EXAMPLE,
+        tasks_dir=tasks_dir,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
     )
     overall_score, individual_scores = mmlu.run()

@@ -62,7 +65,11 @@
 )
 def test_mmlu(eval_mock):
     tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"]
-    mmlu = MMLUEvaluator(model_path=MODEL_EXAMPLE, tasks=tasks)
+    mmlu = MMLUEvaluator(
+        model_path=MODEL_EXAMPLE,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
+    )
     overall_score, individual_scores = mmlu.run()

     eval_mock.assert_called()