Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

eval API opt #794

Merged
merged 2 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions gptqmodel/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def eval(
tasks: Union[List[LM_EVAL_TASK], List[EVALPLUS_TASK]],
batch: int = 1,
trust_remote_code: bool = False,
output_file: Optional[str] = None,
):
if framework is None:
raise ValueError("eval parameter: `framework` cannot be set to None")
Expand All @@ -238,17 +239,12 @@ def eval(
if task not in LM_EVAL_TASK.get_task_enums():
raise ValueError(f"lm_eval support tasks: {LM_EVAL_TASK.get_all_tasks_string()}")

from pathlib import Path

from gptqmodel.utils.eval import lm_eval
from lm_eval.utils import make_table
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id_or_path, trust_remote_code=trust_remote_code)

result_path = Path("lm_eval_results")
result_path.mkdir(parents=True, exist_ok=True)

results = lm_eval(
model_id_or_path,
model_name="hf",
Expand All @@ -257,7 +253,7 @@ def eval(
trust_remote_code=trust_remote_code,
batch_size=batch,
apply_chat_template=True if tokenizer.chat_template is not None else False,
output_path=str(result_path)
output_path=output_file
)
print('--------lm_eval Eval Result---------')
print(make_table(results))
Expand All @@ -278,6 +274,7 @@ def eval(
dataset=task.value,
batch=batch,
trust_remote_code=trust_remote_code,
output_file=output_file
)
results[task.value] = {"base tests": base_formatted, "base + extra tests": plus_formatted, "results_path": result_path}
print('--------evalplus Eval Result---------')
Expand Down
2 changes: 1 addition & 1 deletion gptqmodel/nn_modules/qlinear/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _validate(cls, bits: int, group_size: int, desc_act: bool, sym: bool, dynami
if device is not None:
try:
cls.validate_device(device)
except NotImplementedError as e:
except NotImplementedError:
e = f"{cls} does not support device: {device}"
return False, NotImplementedError(e)

Expand Down
18 changes: 10 additions & 8 deletions gptqmodel/utils/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def evalplus(
dataset: str,
batch: int = 1,
trust_remote_code: bool = False,
output_file: Optional[str] = None,
):
try:
from evalplus.evaluate import evaluate
Expand All @@ -73,20 +74,21 @@ def evalplus(

assert dataset in ["humaneval", "mbpp"], f"Invalid dataset {dataset}"

evaluate(dataset=dataset, model=model, backend="gptqmodel", bs=batch, trust_remote_code=trust_remote_code,
evaluate(dataset=dataset, model=model, backend="gptqmodel", bs=batch, trust_remote_code=trust_remote_code, output_file=output_file,
greedy=True)

result_path = model.strip("./").replace("/", "--") + "_gptqmodel_temp_0.0_eval_results.json"
result_path = os.path.join("evalplus_results", dataset, result_path)
if output_file is None:
output_file = model.strip("./").replace("/", "--") + "_gptqmodel_temp_0.0_eval_results.json"
output_file = os.path.join("evalplus_results", dataset, output_file)

if not os.path.exists(result_path):
raise FileNotFoundError(f"No such file: {result_path}")
if not os.path.exists(output_file):
raise FileNotFoundError(f"No such file: {output_file}")

try:
with open(result_path, 'r') as file:
with open(output_file, 'r') as file:
data = json.load(file)
except json.JSONDecodeError:
raise ValueError(f"Failed to decode JSON: {result_path}")
raise ValueError(f"Failed to decode JSON: {output_file}")

try:
pass_at_k = data["pass_at_k"]
Expand All @@ -100,7 +102,7 @@ def evalplus(
except ValueError as e:
raise ValueError(f"Data format error: {str(e)}")

return base_formatted, plus_formatted, result_path
return base_formatted, plus_formatted, output_file


def evalplus_make_table(results):
Expand Down
29 changes: 16 additions & 13 deletions tests/test_eval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tempfile
import unittest
from typing import Union

Expand All @@ -8,7 +9,6 @@

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


class TestEval(unittest.TestCase):
@classmethod
def setUpClass(self):
Expand All @@ -21,18 +21,21 @@ def setUpClass(self):
]
)
def test_eval(self, eval_backend: EVAL, task: Union[LM_EVAL_TASK, EVALPLUS_TASK]):
results = GPTQModel.eval(self.MODEL_ID, framework=eval_backend, tasks=[task], batch=32)
if eval_backend == EVAL.LM_EVAL:
acc_score = results['results'].get(task.value, {}).get('acc,none')
acc_norm_score = results['results'].get(task.value, {}).get('acc_norm,none')

self.assertGreaterEqual(acc_score, 0.31, "acc score does not match expected result")
self.assertGreaterEqual(acc_norm_score, 0.35, "acc_norm score does not match expected result")
elif eval_backend == EVAL.EVALPLUS:
result = results.get(task.value)
base_formatted, plus_formatted, _ = float(result.get("base tests")), float(result.get("base + extra tests")), result.get("results_path")
self.assertGreaterEqual(base_formatted, 0.31, "Base score does not match expected result")
self.assertGreaterEqual(plus_formatted, 0.29, "Plus score does not match expected result")
with tempfile.TemporaryDirectory() as tmp_dir:
output_file = f"{tmp_dir}/result.json"
results = GPTQModel.eval(self.MODEL_ID, framework=eval_backend, tasks=[task], batch=32, output_file=output_file)
if eval_backend == EVAL.LM_EVAL:
acc_score = results['results'].get(task.value, {}).get('acc,none')
acc_norm_score = results['results'].get(task.value, {}).get('acc_norm,none')

self.assertGreaterEqual(acc_score, 0.31, "acc score does not match expected result")
self.assertGreaterEqual(acc_norm_score, 0.35, "acc_norm score does not match expected result")
elif eval_backend == EVAL.EVALPLUS:
result = results.get(task.value)
base_formatted, plus_formatted, _ = float(result.get("base tests")), float(
result.get("base + extra tests")), result.get("results_path")
self.assertGreaterEqual(base_formatted, 0.29, "Base score does not match expected result")
self.assertGreaterEqual(plus_formatted, 0.26, "Plus score does not match expected result")



Expand Down
9 changes: 6 additions & 3 deletions tests/test_evalplus.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import tempfile
import unittest

from gptqmodel.utils.eval import evalplus
Expand All @@ -12,8 +13,10 @@ def setUpClass(self):
self.MODEL_ID = "/monster/data/model/Llama-3.2-1B-Instruct"

def test_evalplus(self):
base_formatted, plus_formatted, _ = evalplus(model=self.MODEL_ID, dataset='humaneval')
self.assertGreaterEqual(float(base_formatted), 0.31, "Base score does not match expected result")
self.assertGreaterEqual(float(plus_formatted), 0.29, "Plus score does not match expected result")
with tempfile.TemporaryDirectory() as tmp_dir:
output_file = f"{tmp_dir}/result.json"
base_formatted, plus_formatted, _ = evalplus(model=self.MODEL_ID, dataset='humaneval', output_file=output_file)
self.assertGreaterEqual(float(base_formatted), 0.29, "Base score does not match expected result")
self.assertGreaterEqual(float(plus_formatted), 0.26, "Plus score does not match expected result")