Commit 7f15cce

refactor: limit usage of scipy and sklearn dependencies (#2097)
* refactor: move scipy and sklearn module imports to func imports

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
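
The pattern this bullet applies, sketched minimally (the function name and body below are illustrative, not copied from the diff): the sklearn import moves from module scope into the function body, so the dependency is only loaded when a metric that needs it is actually evaluated.

# Before: a module-level import makes sklearn a hard requirement
# just to import the metrics module
#
#   import sklearn.metrics
#
#   def f1(items):
#       golds, preds = zip(*items)
#       return sklearn.metrics.f1_score(golds, preds)


# After: the import is deferred until the metric is called
def f1(items):
    from sklearn.metrics import f1_score  # loaded on first call, not at module import

    golds, preds = zip(*items)
    return f1_score(golds, preds)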

* refactor: consolidate weighted_f1_score func into lm_eval utils

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
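
For reference, the consolidated helper (added to lm_eval/utils.py further down in this diff) aggregates (gold, pred) pairs. A hypothetical usage sketch, with made-up label values:

from lm_eval.utils import weighted_f1_score

# Each item is a (gold, pred) pair; the helper unzips them into
# parallel gold and prediction sequences before scoring.
items = [(0, 0), (1, 2), (2, 2), (1, 1)]

# Equivalent to sklearn.metrics.f1_score(golds, preds, average="weighted")
score = weighted_f1_score(items)
print(score)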

* lint: allow for utils file to have unused imports

this allows shared functions to be defined only once
while keeping the YAML function importing working

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
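
For context on why the unused imports must survive linting: task YAML configs in this harness resolve aggregation callables from the task's own utils module (via a tag along the lines of !function utils.weighted_f1_score), so each task file re-imports the shared helper purely to keep that name resolvable. A hedged sketch of what a task-side utils.py looks like after this change (the doc_to_target mapping mirrors the afrixnli diffs below):

# lm_eval/tasks/<task>/utils.py (illustrative sketch)

# Looks unused to ruff (F401), but a task YAML that references
# utils.weighted_f1_score still resolves it through this module;
# the per-file ignore added to pyproject.toml below keeps ruff quiet.
from lm_eval.utils import weighted_f1_score


def doc_to_target(doc):
    # Label mapping taken from the afrixnli task diffs in this commit
    replacements = {0: "True", 1: "Neither", 2: "False"}
    return replacements[doc["label"]]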

---------

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
nathan-weinberg authored Aug 1, 2024
1 parent 63e76e8 commit 7f15cce
Showing 17 changed files with 42 additions and 89 deletions.
10 changes: 6 additions & 4 deletions lm_eval/api/metrics.py
@@ -8,7 +8,6 @@

import numpy as np
import sacrebleu
import sklearn.metrics

from lm_eval.api.registry import register_aggregation, register_metric

@@ -51,21 +50,24 @@ def bits_per_byte(items):

@register_aggregation("f1")
def f1_score(items):
from sklearn.metrics import f1_score

unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
fscore = f1_score(golds, preds)

return np.max(fscore)


@register_aggregation("matthews_corrcoef")
def matthews_corrcoef(items):
from sklearn.metrics import matthews_corrcoef

unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
# print(preds)
return sklearn.metrics.matthews_corrcoef(golds, preds)
return matthews_corrcoef(golds, preds)


@register_aggregation("bleu")
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrimmlu/direct/utils.py
@@ -1,4 +1,4 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
choice4=choices[3],
)
return text


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrimmlu/translate/utils.py
@@ -1,4 +1,4 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
choice4=choices[3],
)
return text


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrimmlu/utils.py
@@ -1,4 +1,4 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
choice4=choices[3],
)
return text


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrixnli/anli prompt/en-direct/utils.py
@@ -1,14 +1,6 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_target(doc):
replacements = {0: "True", 1: "Neither", 2: "False"}
return replacements[doc["label"]]


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrixnli/anli prompt/native-direct/utils.py
@@ -1,9 +1 @@
from sklearn.metrics import f1_score


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
from lm_eval.utils import weighted_f1_score
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrixnli/anli prompt/translate/utils.py
@@ -1,14 +1,6 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_target(doc):
replacements = {0: "True", 1: "Neither", 2: "False"}
return replacements[doc["label"]]


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrixnli/lai prompt/direct/utils.py
@@ -1,4 +1,4 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_text(doc):
@@ -17,11 +17,3 @@ def doc_to_text(doc):
def doc_to_target(doc):
replacements = {0: "entailment", 1: "neutral", 2: "contradiction"}
return replacements[doc["label"]]


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
10 changes: 1 addition & 9 deletions lm_eval/tasks/afrixnli/lai prompt/translate/utils.py
@@ -1,4 +1,4 @@
from sklearn.metrics import f1_score
from lm_eval.utils import weighted_f1_score


def doc_to_text(doc):
@@ -17,11 +17,3 @@ def doc_to_text(doc):
def doc_to_target(doc):
replacements = {0: "entailment", 1: "neutral", 2: "contradiction"}
return replacements[doc["label"]]


def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
3 changes: 2 additions & 1 deletion lm_eval/tasks/drop/utils.py
@@ -2,7 +2,6 @@
import string

import numpy as np
from scipy.optimize import linear_sum_assignment


_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
@@ -117,6 +116,8 @@ def _align_bags(predicted, gold):
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
between them and gets maximum metric values over all the answers.
"""
from scipy.optimize import linear_sum_assignment

scores = np.zeros([len(gold), len(predicted)])
for gold_index, gold_item in enumerate(gold):
for pred_index, pred_item in enumerate(predicted):
3 changes: 2 additions & 1 deletion lm_eval/tasks/kobest/utils.py
@@ -1,5 +1,4 @@
from datasets import Dataset
from sklearn.metrics import f1_score


def copa_doc_to_text(doc: dict) -> str:
@@ -41,6 +40,8 @@ def preprocessor(dataset):


def macro_f1_score(items):
from sklearn.metrics import f1_score

unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
9 changes: 5 additions & 4 deletions lm_eval/tasks/super_glue/cb/aggregate.py
@@ -1,13 +1,14 @@
import numpy as np
import sklearn


def cb_multi_fi(items):
from sklearn.metrics import f1_score

preds, golds = zip(*items)
preds = np.array(preds)
golds = np.array(golds)
f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
f11 = f1_score(y_true=golds == 0, y_pred=preds == 0)
f12 = f1_score(y_true=golds == 1, y_pred=preds == 1)
f13 = f1_score(y_true=golds == 2, y_pred=preds == 2)
avg_f1 = np.mean([f11, f12, f13])
return avg_f1
5 changes: 2 additions & 3 deletions lm_eval/tasks/super_glue/cb/t5_utils.py
@@ -1,6 +1,3 @@
import sklearn.metrics


def mean_3class_f1(predictions, references): # This is a passthrough function
string_label = ["entailment", "contradiction", "neutral"]
predictions = (
@@ -23,6 +20,8 @@ def agg_mean_3class_f1(items):
}

def _fn(predictions, references):
import sklearn.metrics

metric_fn = getattr(sklearn.metrics, metric_str)
metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
return metric_val
5 changes: 3 additions & 2 deletions lm_eval/tasks/super_glue/multirc/t5_utils.py
@@ -1,7 +1,6 @@
import collections

import numpy as np
import sklearn.metrics


def f1(predictions, references): # This is a passthrough function
@@ -19,10 +18,12 @@ def f1(predictions, references): # This is a passthrough function


def agg_f1(items):
from sklearn.metrics import f1_score

predictions, references = zip(*items)
references, predictions = np.asarray(references), np.asarray(predictions)

return sklearn.metrics.f1_score(references, predictions)
return f1_score(references, predictions)


def em(predictions, references): # This is a passthrough function
10 changes: 10 additions & 0 deletions lm_eval/utils.py
@@ -487,3 +487,13 @@ def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
among ranks in multigpu setting or only pulling a sample of documents
"""
return islice(raw_iterator, rank, limit, world_size)


def weighted_f1_score(items):
from sklearn.metrics import f1_score

unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
1 change: 1 addition & 0 deletions pyproject.toml
@@ -104,3 +104,4 @@ known-first-party = ["lm_eval"]

[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403"]
"utils.py" = ["F401"]
5 changes: 3 additions & 2 deletions scripts/model_comparator.py
@@ -4,7 +4,6 @@

import numpy as np
import pandas as pd
import scipy.stats
import torch

import lm_eval.evaluator
@@ -23,11 +22,13 @@ def memory_stats():


def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
from scipy.stats import norm

acc1, acc2 = res1["acc,none"], res2["acc,none"]
st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
# Determining the p-value
p_value = 2 * scipy.stats.norm.sf(abs(Z)) # two-tailed test
p_value = 2 * norm.sf(abs(Z)) # two-tailed test
return Z, p_value

