Allows MMLU to have the system_prompt provided to it #197

Merged · 3 commits · Dec 13, 2024
Changes from 2 commits
3 changes: 2 additions & 1 deletion .pylintrc
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
 pointless-statement,
 wrong-import-order,
 line-too-long,
-dangerous-default-value
+dangerous-default-value,
+too-many-instance-attributes

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
2 changes: 2 additions & 0 deletions .spellcheck-en-custom.txt
@@ -26,3 +26,5 @@ TODO
 tox
 venv
 vllm
+barebones
+LM
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.2
+
+* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise it will pass the model a barebones prompt.
+
 ## 0.4

 * Added ability to specify a custom http client to MT-Bench
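For reviewers, here is a minimal usage sketch of the new option (illustrative, not part of this diff; the model path and task names are taken from scripts/test_mmlu.py below):

```python
# Sketch: evaluating with a custom system prompt via the new parameter.
# Assumes instructlab-eval with this PR applied; model and tasks are examples.
from instructlab.eval.mmlu import MMLUEvaluator

evaluator = MMLUEvaluator(
    model_path="instructlab/granite-7b-lab",
    tasks=["mmlu_anatomy", "mmlu_astronomy"],
    system_prompt="My primary function is to be a chat assistant.",  # optional
)
# With system_prompt set, LM-eval applies the model's chat template under
# the hood; omitting it evaluates with a barebones prompt, as before.
overall_score, individual_scores = evaluator.run()
print(overall_score, individual_scores)
```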
8 changes: 7 additions & 1 deletion scripts/test_mmlu.py
@@ -1,13 +1,19 @@
 # First Party
 from instructlab.eval.mmlu import MMLUEvaluator

+SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
+

 def test_minimal_mmlu():
     print("===> Executing 'test_minimal_mmlu'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         tasks = ["mmlu_anatomy", "mmlu_astronomy"]
-        mmlu = MMLUEvaluator(model_path=model_path, tasks=tasks)
+        mmlu = MMLUEvaluator(
+            model_path=model_path,
+            tasks=tasks,
+            system_prompt=SYSTEM_PROMPT,
+        )
         overall_score, individual_scores = mmlu.run()
         print(overall_score)
         print(individual_scores)
30 changes: 23 additions & 7 deletions src/instructlab/eval/mmlu.py
@@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
     few_shots      number of examples
     batch_size     batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
     device         PyTorch device (e.g. "cpu" or "cuda:0") for running models
+    system_prompt  system prompt to be used when applying the chat template
     """

     def __init__(
@@ -113,8 +114,10 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
+        self.system_prompt = system_prompt
         self.tasks_dir = tasks_dir
         self.tasks = tasks
         self.model_dtype = model_dtype
@@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
         if not os.access(self.tasks_dir, os.R_OK):
             raise InvalidTasksDirError(self.tasks_dir)
         tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
+        should_apply_chat_template = self.system_prompt is not None
         mmlu_output = self._simple_evaluate_with_error_handling(
             model=model,
             model_args=model_args,
@@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
             batch_size=self.batch_size,
             device=self.device,
             task_manager=tm,
+            system_instruction=self.system_prompt,
+            apply_chat_template=should_apply_chat_template,
         )
         results = mmlu_output["results"]
         return results
@@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
     Evaluator for Massive Multitask Language Understanding (MMLU)

     Attributes:
-        model_path   absolute path to or name of a huggingface model
-        tasks        list of tasks for MMLU to test the model with
-        model_dtype  dtype of model when served
-        few_shots    number of examples
-        batch_size   batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
-        device       PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        model_path     absolute path to or name of a huggingface model
+        tasks          list of tasks for MMLU to test the model with
+        model_dtype    dtype of model when served
+        few_shots      number of examples
+        batch_size     batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
+        device         PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt  system prompt to be used when applying the chat template
     """

     name = "mmlu"
@@ -231,9 +238,17 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         super().__init__(
-            model_path, None, tasks, model_dtype, few_shots, batch_size, device
+            model_path,
+            None,
+            tasks,
+            model_dtype,
+            few_shots,
+            batch_size,
+            device,
+            system_prompt=system_prompt,
         )

@@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):

     Attributes:
         model_path     absolute path to or name of a huggingface model
+        system_prompt  system prompt to be used when applying the chat template
         tasks_dir      path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
         tasks          group name that is shared by all the MMLUBranch tasks
         model_dtype    dtype of model when served
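For context, the two new keyword arguments map directly onto lm-eval's `simple_evaluate`; the change is roughly equivalent to the sketch below. This is an illustration, not the evaluator's actual code path: it assumes lm-evaluation-harness >= 0.4.3 (where `system_instruction` and `apply_chat_template` exist) and uses an example model and task.

```python
# Illustrative sketch of the underlying lm-eval call; not part of this diff.
from lm_eval import simple_evaluate

system_prompt = "My primary function is to be a chat assistant."  # or None
results = simple_evaluate(
    model="hf",
    model_args="pretrained=instructlab/granite-7b-lab",
    tasks=["mmlu_anatomy"],
    num_fewshot=5,
    batch_size="auto",
    system_instruction=system_prompt,               # forwarded from system_prompt
    apply_chat_template=system_prompt is not None,  # template only when a prompt is set
)
print(results["results"])
```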
11 changes: 9 additions & 2 deletions tests/test_mmlu.py
@@ -48,7 +48,10 @@ def test_mmlu_branch(eval_mock):
     tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg"
     tasks = ["mmlu_pr"]
     mmlu = MMLUBranchEvaluator(
-        model_path=MODEL_EXAMPLE, tasks_dir=tasks_dir, tasks=tasks
+        model_path=MODEL_EXAMPLE,
+        tasks_dir=tasks_dir,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
     )
     overall_score, individual_scores = mmlu.run()

@@ -62,7 +65,11 @@
 )
 def test_mmlu(eval_mock):
     tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"]
-    mmlu = MMLUEvaluator(model_path=MODEL_EXAMPLE, tasks=tasks)
+    mmlu = MMLUEvaluator(
+        model_path=MODEL_EXAMPLE,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
+    )
     overall_score, individual_scores = mmlu.run()

     eval_mock.assert_called()