Allows MMLU to have the system_prompt provided to it #197

Merged · 3 commits · Dec 13, 2024
Changes from 2 commits
3 changes: 2 additions & 1 deletion .pylintrc
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
 pointless-statement,
 wrong-import-order,
 line-too-long,
-dangerous-default-value
+dangerous-default-value,
+too-many-instance-attributes

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
2 changes: 2 additions & 0 deletions .spellcheck-en-custom.txt
@@ -26,3 +26,5 @@ TODO
 tox
 venv
 vllm
+barebones
+LM
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.4.2
+
+* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise it will pass the model a barebones prompt.
+
 ## 0.4

 * Added ability to specify a custom http client to MT-Bench
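For reviewers, here is a minimal usage sketch of the new option (illustrative, not part of this diff; the model path and task names are taken from scripts/test_mmlu.py below):

```python
# Sketch: evaluating with a custom system prompt via the new parameter.
# Assumes instructlab-eval with this PR applied; model and tasks are examples.
from instructlab.eval.mmlu import MMLUEvaluator

evaluator = MMLUEvaluator(
    model_path="instructlab/granite-7b-lab",
    tasks=["mmlu_anatomy", "mmlu_astronomy"],
    system_prompt="My primary function is to be a chat assistant.",  # optional
)
# With system_prompt set, LM-eval applies the model's chat template under
# the hood; omitting it evaluates with a barebones prompt, as before.
overall_score, individual_scores = evaluator.run()
print(overall_score, individual_scores)
```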
8 changes: 7 additions & 1 deletion scripts/test_mmlu.py
@@ -1,13 +1,19 @@
 # First Party
 from instructlab.eval.mmlu import MMLUEvaluator

+SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant."""
+

 def test_minimal_mmlu():
     print("===> Executing 'test_minimal_mmlu'...")
     try:
         model_path = "instructlab/granite-7b-lab"
         tasks = ["mmlu_anatomy", "mmlu_astronomy"]
-        mmlu = MMLUEvaluator(model_path=model_path, tasks=tasks)
+        mmlu = MMLUEvaluator(
+            model_path=model_path,
+            tasks=tasks,
+            system_prompt=SYSTEM_PROMPT,
+        )
         overall_score, individual_scores = mmlu.run()
         print(overall_score)
         print(individual_scores)
30 changes: 23 additions & 7 deletions src/instructlab/eval/mmlu.py
@@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
     few_shots      number of examples
     batch_size     batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
     device         PyTorch device (e.g. "cpu" or "cuda:0") for running models
+    system_prompt  system prompt to be used when applying the chat template
     """

     def __init__(
@@ -113,8 +114,10 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         self.model_path = model_path
+        self.system_prompt = system_prompt
         self.tasks_dir = tasks_dir
         self.tasks = tasks
         self.model_dtype = model_dtype
@@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
         if not os.access(self.tasks_dir, os.R_OK):
             raise InvalidTasksDirError(self.tasks_dir)
         tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
+        should_apply_chat_template = self.system_prompt is not None
         mmlu_output = self._simple_evaluate_with_error_handling(
             model=model,
             model_args=model_args,
@@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
             batch_size=self.batch_size,
             device=self.device,
             task_manager=tm,
+            system_instruction=self.system_prompt,
+            apply_chat_template=should_apply_chat_template,
         )
         results = mmlu_output["results"]
         return results
@@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
     Evaluator for Massive Multitask Language Understanding (MMLU)

     Attributes:
-        model_path   absolute path to or name of a huggingface model
-        tasks        list of tasks for MMLU to test the model with
-        model_dtype  dtype of model when served
-        few_shots    number of examples
-        batch_size   batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
-        device       PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        model_path     absolute path to or name of a huggingface model
+        tasks          list of tasks for MMLU to test the model with
+        model_dtype    dtype of model when served
+        few_shots      number of examples
+        batch_size     batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
+        device         PyTorch device (e.g. "cpu" or "cuda:0") for running models
+        system_prompt  system prompt to be used when applying the chat template
     """

     name = "mmlu"
@@ -231,9 +238,17 @@ def __init__(
         few_shots: int = 5,
         batch_size: Optional[Union[int, str]] = "auto",
         device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
+        system_prompt: Optional[str] = None,
     ) -> None:
         super().__init__(
-            model_path, None, tasks, model_dtype, few_shots, batch_size, device
+            model_path,
+            None,
+            tasks,
+            model_dtype,
+            few_shots,
+            batch_size,
+            device,
+            system_prompt=system_prompt,
         )

@@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):

     Attributes:
         model_path     absolute path to or name of a huggingface model
+        system_prompt  system prompt to be used when applying the chat template
         tasks_dir      path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
         tasks          group name that is shared by all the MMLUBranch tasks
         model_dtype    dtype of model when served
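For context, the two new keyword arguments map directly onto lm-eval's `simple_evaluate`; the change is roughly equivalent to the sketch below. This is an illustration, not the evaluator's actual code path: it assumes lm-evaluation-harness >= 0.4.3 (where `system_instruction` and `apply_chat_template` exist) and uses an example model and task.

```python
# Illustrative sketch of the underlying lm-eval call; not part of this diff.
from lm_eval import simple_evaluate

system_prompt = "My primary function is to be a chat assistant."  # or None
results = simple_evaluate(
    model="hf",
    model_args="pretrained=instructlab/granite-7b-lab",
    tasks=["mmlu_anatomy"],
    num_fewshot=5,
    batch_size="auto",
    system_instruction=system_prompt,               # forwarded from system_prompt
    apply_chat_template=system_prompt is not None,  # template only when a prompt is set
)
print(results["results"])
```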
11 changes: 9 additions & 2 deletions tests/test_mmlu.py
@@ -48,7 +48,10 @@ def test_mmlu_branch(eval_mock):
     tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg"
     tasks = ["mmlu_pr"]
     mmlu = MMLUBranchEvaluator(
-        model_path=MODEL_EXAMPLE, tasks_dir=tasks_dir, tasks=tasks
+        model_path=MODEL_EXAMPLE,
+        tasks_dir=tasks_dir,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
     )
     overall_score, individual_scores = mmlu.run()

@@ -62,7 +65,11 @@
 )
 def test_mmlu(eval_mock):
     tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"]
-    mmlu = MMLUEvaluator(model_path=MODEL_EXAMPLE, tasks=tasks)
+    mmlu = MMLUEvaluator(
+        model_path=MODEL_EXAMPLE,
+        tasks=tasks,
+        system_prompt="You are an intelligent AI language model.",
+    )
     overall_score, individual_scores = mmlu.run()

     eval_mock.assert_called()