Commit
refactor evaluation
SeanLee97 committed Jul 26, 2024
1 parent ccce515 commit c51e05d
Showing 5 changed files with 138 additions and 76 deletions.
3 changes: 2 additions & 1 deletion angle_emb/__init__.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-

from .angle import *
from .angle import * # NOQA
from .evaluation import * # NOQA


__version__ = '0.4.8'
92 changes: 18 additions & 74 deletions angle_emb/angle.py
@@ -10,15 +10,10 @@
from typing import Any, Dict, Optional, List, Union, Tuple, Callable
from dataclasses import dataclass

import scipy
import scipy.stats
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import bitsandbytes as bnb
from tqdm import tqdm
from boltons.iterutils import chunked_iter
from datasets import Dataset
from transformers import (
AutoModelForCausalLM, AutoModel, AutoTokenizer,
@@ -35,6 +30,7 @@
from peft.tuners.lora import LoraLayer

from .utils import logger
from .evaluation import CorrelationEvaluator


DEFAULT_LLM_PATTERNS = [r'.*llama.*', r'.*qwen.*', r'.*baichuan.*', r'.*mistral.*']
@@ -237,44 +233,6 @@ def contrastive_with_negative_loss(
return nn.CrossEntropyLoss()(scores, labels)


def compute_corrcoef(x: np.ndarray, y: np.ndarray) -> float:
"""
Compute correlation coefficients
:param x: np.ndarry, x array
:param y: np.ndarry, y array
:return: float
"""
return scipy.stats.spearmanr(x, y).correlation


def l2_normalize(arr: np.ndarray) -> np.ndarray:
"""
Normalize array using L2
:param arr: np.ndarray, input array
:return: np.ndarray
"""
norms = (arr**2).sum(axis=1, keepdims=True)**0.5
return arr / np.clip(norms, 1e-8, np.inf)


def optimal_threshold(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float, float]:
"""
Compute optimal threshold
:param y_true: np.ndarray, y_true
:param y_pred: np.ndarray, y_true
:return: Tuple[float, float]
"""
loss = lambda t: -np.mean((y_true > 0.5) == (y_pred > np.tanh(t))) # NOQA
result = scipy.optimize.minimize(loss, 1, method='Powell')
return np.tanh(result.x), -result.fun


def check_llm(model_name_or_path: str, llm_regex_patterns: List[str] = None) -> bool:
if llm_regex_patterns is not None:
llm_regex_patterns += DEFAULT_LLM_PATTERNS
@@ -1499,35 +1457,21 @@ def fit(self,
trainer.push_to_hub()
self.backbone.save_pretrained(output_dir)

def evaluate(self, data: Dataset, batch_size: int = 32, threshold: Optional[float] = None, device: Any = None):
self.backbone.eval()
data_collator = AngleDataCollator(
self.tokenizer,
return_tensors="pt",
max_length=self.max_length,
filter_duplicate=False,
)
y_trues, y_preds = [], []
# for X, y in data.make_iter(random=False):
for features in tqdm(chunked_iter(data, batch_size), desc='Evaluate'):
X = data_collator(features)
y = X.pop('labels', None)
y_trues.extend(y[::2, 0].detach().cpu().numpy())
with torch.no_grad():
X.to(device or self.device)
x_vecs = self.pooler(X,
pooling_strategy=self.pooling_strategy).detach().float().cpu().numpy()
x_vecs = l2_normalize(x_vecs)
pred = (x_vecs[::2] * x_vecs[1::2]).sum(1)
y_preds.extend(pred)

y_trues, y_preds = np.array(y_trues), np.array(y_preds)
corrcoef = compute_corrcoef(y_trues, y_preds)
if threshold is None:
_, accuracy = optimal_threshold(y_trues, y_preds)
else:
accuracy = np.mean((y_trues > 0.5) == (y_preds > threshold))
return corrcoef, accuracy
def evaluate(self, data: Dataset, batch_size: int = 32, metric: str = 'spearman_cosine') -> float:
""" evaluate
:param data: Dataset, DatasetFormats.A is required
:param batch_size: int. Default 32.
:param metric: str. Default 'spearman_cosine'.
:return: float.
"""
return CorrelationEvaluator(
text1=data['text1'],
text2=data['text2'],
labels=data['label'],
batch_size=batch_size,
)(self)[metric]

def encode(self,
inputs: Union[List[str], Tuple[str], List[Dict], str],
@@ -1656,7 +1600,7 @@ def __init__(self,
self.hub_private_repo = hub_private_repo

def on_epoch_end(self, args, state, control, **kwargs):
corrcoef, accuracy = self.evaluate_fn(self.valid_ds)
corrcoef = self.evaluate_fn(self.valid_ds)
if corrcoef > self.best_corrcoef:
self.best_corrcoef = corrcoef
print('new best corrcoef!')
@@ -1669,4 +1613,4 @@ def on_epoch_end(self, args, state, control, **kwargs):
private=self.hub_private_repo,
exist_ok=True,
commit_message='new best checkpoint')
print(f'corrcoef: {corrcoef}, accuracy: {accuracy}, best corrcoef: {self.best_corrcoef}')
logger.info(f'corrcoef: {corrcoef}, best corrcoef: {self.best_corrcoef}')
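
For context, a minimal sketch of calling the new evaluate() API (the dataset choice and column renames below are illustrative assumptions, not part of this commit):

from datasets import load_dataset
from angle_emb import AnglE

# Load the model (pooling_strategy as used in tests/test_eval.py below).
angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')

# evaluate() expects DatasetFormats.A columns: text1, text2, label.
ds = load_dataset('sentence-transformers/stsb', split='validation')
ds = ds.rename_columns({'sentence1': 'text1', 'sentence2': 'text2', 'score': 'label'})

spearman = angle.evaluate(ds, batch_size=32, metric='spearman_cosine')
print(f'spearman_cosine: {spearman:.4f}')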
97 changes: 97 additions & 0 deletions angle_emb/evaluation.py
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-

from typing import List, TYPE_CHECKING

import numpy as np
from boltons.iterutils import chunked_iter
from tqdm import tqdm
from sklearn.metrics.pairwise import (
paired_cosine_distances,
paired_euclidean_distances,
paired_manhattan_distances
)
from scipy.stats import pearsonr, spearmanr

if TYPE_CHECKING:
    # Import only for type checking: angle.py imports CorrelationEvaluator
    # from this module at import time, so a runtime import here would be circular.
    from .angle import AnglE


class CorrelationEvaluator(object):
def __init__(
self,
text1: List[str],
text2: List[str],
labels: List[float],
batch_size: int = 32
):
assert len(text1) == len(text2) == len(labels), "text1, text2, and labels must have the same length"
self.text1 = text1
self.text2 = text2
self.labels = labels
self.batch_size = batch_size

    def __call__(self, model: 'AnglE', **kwargs) -> dict:
""" Evaluate the model on the given dataset.
:param model: AnglE, the model to evaluate.
:param kwargs: Additional keyword arguments to pass to the `encode` method of the model.
:return: dict, The evaluation results.
"""
embeddings1 = []
embeddings2 = []
for chunk in tqdm(chunked_iter(range(len(self.text1)), self.batch_size)):
batch_text1 = [self.text1[i] for i in chunk]
batch_text2 = [self.text2[i] for i in chunk]

batch_embeddings1 = model.encode(batch_text1, **kwargs)
batch_embeddings2 = model.encode(batch_text2, **kwargs)
embeddings1.append(batch_embeddings1)
embeddings2.append(batch_embeddings2)

embeddings1 = np.concatenate(embeddings1, axis=0)
embeddings2 = np.concatenate(embeddings2, axis=0)

        cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)
euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)
dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]

        pearson_cosine, _ = pearsonr(self.labels, cosine_scores)
        spearman_cosine, _ = spearmanr(self.labels, cosine_scores)

pearson_manhattan, _ = pearsonr(self.labels, manhattan_distances)
spearman_manhattan, _ = spearmanr(self.labels, manhattan_distances)

pearson_euclidean, _ = pearsonr(self.labels, euclidean_distances)
spearman_euclidean, _ = spearmanr(self.labels, euclidean_distances)

pearson_dot, _ = pearsonr(self.labels, dot_products)
spearman_dot, _ = spearmanr(self.labels, dot_products)

metrics = {
"pearson_cosine": pearson_cosine,
"spearman_cosine": spearman_cosine,
"pearson_manhattan": pearson_manhattan,
"spearman_manhattan": spearman_manhattan,
"pearson_euclidean": pearson_euclidean,
"spearman_euclidean": spearman_euclidean,
"pearson_dot": pearson_dot,
"spearman_dot": spearman_dot,
}
return metrics

def list_all_metrics(self) -> List[str]:
""" Get a list of all the metrics that can be computed by this evaluator.
:return: List[str], A list of all the metrics that can be computed by this evaluator.
"""
return [
"pearson_cosine",
"spearman_cosine",
"pearson_manhattan",
"spearman_manhattan",
"pearson_euclidean",
"spearman_euclidean",
"pearson_dot",
"spearman_dot",
]
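
For reference, spearman_cosine above reduces to the following computation (a toy sketch with synthetic embeddings, independent of this commit):

import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import paired_cosine_distances

rng = np.random.RandomState(0)
emb1, emb2 = rng.randn(4, 8), rng.randn(4, 8)  # toy embedding pairs
labels = [0.1, 0.5, 0.7, 0.9]                  # toy gold similarity scores
cosine_scores = 1 - paired_cosine_distances(emb1, emb2)
print(spearmanr(labels, cosine_scores).correlation)  # == metrics['spearman_cosine']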
3 changes: 2 additions & 1 deletion requirements.txt
@@ -6,4 +6,5 @@ prettytable
transformers>=4.32.1
scipy
einops
wandb
wandb
scikit-learn
19 changes: 19 additions & 0 deletions tests/test_eval.py
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-


def test_eval():
from datasets import load_dataset
from angle_emb import AnglE, CorrelationEvaluator

angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')
eval_dataset = load_dataset('sentence-transformers/stsb', split="test")

spearman = CorrelationEvaluator(
text1=eval_dataset["sentence1"],
text2=eval_dataset["sentence2"],
labels=eval_dataset["score"],
)(angle)['spearman_cosine']
assert spearman > 0.9

    # angle.evaluate() reads DatasetFormats.A columns (text1/text2/label),
    # so rename the STS-B columns before calling it.
    eval_dataset = eval_dataset.rename_columns(
        {'sentence1': 'text1', 'sentence2': 'text2', 'score': 'label'})
    spearman = angle.evaluate(eval_dataset)
assert spearman > 0.9
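
To run this test locally (assuming pytest is installed and the Hugging Face Hub is reachable to download the model and dataset):

python -m pytest tests/test_eval.py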
