From fdc7cdf8d781f7b48be2702ec42652542b2ddfe8 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Thu, 25 Jul 2024 03:47:26 +0900 Subject: [PATCH 01/13] Add prediction logging for classification and retrieval --- src/jmteb/evaluators/classification/data.py | 7 ++ .../evaluators/classification/evaluator.py | 17 ++++- src/jmteb/evaluators/retrieval/data.py | 26 ++++++++ src/jmteb/evaluators/retrieval/evaluator.py | 64 +++++++++++++++---- .../test_classification_evaluator.py | 19 ++++++ tests/evaluator/test_retrieval_evaluator.py | 19 ++++++ 6 files changed, 138 insertions(+), 14 deletions(-) diff --git a/src/jmteb/evaluators/classification/data.py b/src/jmteb/evaluators/classification/data.py index 5885471..ba5eb8d 100644 --- a/src/jmteb/evaluators/classification/data.py +++ b/src/jmteb/evaluators/classification/data.py @@ -13,6 +13,13 @@ class ClassificationInstance: label: int +@dataclass +class ClassificationPrediction: + text: str + label: int + prediction: int + + class ClassificationDataset(ABC): @abstractmethod def __len__(self): diff --git a/src/jmteb/evaluators/classification/evaluator.py b/src/jmteb/evaluators/classification/evaluator.py index dbe2d8e..457d949 100644 --- a/src/jmteb/evaluators/classification/evaluator.py +++ b/src/jmteb/evaluators/classification/evaluator.py @@ -11,7 +11,7 @@ from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults from .classifiers import Classifier, KnnClassifier, LogRegClassifier -from .data import ClassificationDataset +from .data import ClassificationDataset, ClassificationPrediction class ClassificationEvaluator(EmbeddingEvaluator): @@ -28,6 +28,7 @@ class ClassificationEvaluator(EmbeddingEvaluator): The first one is specified as the main index. classifiers (dict[str, Classifier]): classifiers to be evaluated. prefix (str | None): prefix for sentences. Defaults to None. + log_predictions (bool): whether to log predictions of each datapoint. 
""" def __init__( @@ -38,6 +39,7 @@ def __init__( average: str = "macro", classifiers: dict[str, Classifier] | None = None, prefix: str | None = None, + log_predictions: bool = False, ) -> None: self.train_dataset = train_dataset self.val_dataset = val_dataset @@ -52,6 +54,7 @@ def __init__( if average_name.strip().lower() in ("micro", "macro", "samples", "weighted", "binary") ] or ["macro"] self.prefix = prefix + self.log_predictions = log_predictions self.main_metric = f"{self.average[0]}_f1" def __call__( @@ -119,6 +122,7 @@ def __call__( "val_scores": val_results, "test_scores": test_results, }, + predictions=self._format_predictions(self.test_dataset, y_pred) if self.log_predictions else None, ) @staticmethod @@ -128,3 +132,14 @@ def _compute_metrics(y_pred: np.ndarray, y_true: list[int], average: list[float] for average_method in average: classifier_results[f"{average_method}_f1"] = f1_score(y_true, y_pred, average=average_method) return classifier_results + + @staticmethod + def _format_predictions(dataset: ClassificationDataset, y_pred: np.ndarray) -> list[ClassificationPrediction]: + texts = [item.text for item in dataset] + y_true = [item.label for item in dataset] + y_pred = y_pred.tolist() + assert len(texts) == len(y_true) == len(y_pred) + return [ + ClassificationPrediction(text=text, label=label, prediction=pred) + for text, label, pred in zip(texts, y_true, y_pred) + ] diff --git a/src/jmteb/evaluators/retrieval/data.py b/src/jmteb/evaluators/retrieval/data.py index 70c69a4..4c8c30b 100644 --- a/src/jmteb/evaluators/retrieval/data.py +++ b/src/jmteb/evaluators/retrieval/data.py @@ -21,6 +21,13 @@ class RetrievalDoc: text: str +@dataclass +class RetrievalPrediction: + query: str + relevant_docs: list[RetrievalDoc] + predicted_relevant_docs: list[RetrievalDoc] + + class RetrievalQueryDataset(ABC): @abstractmethod def __len__(self): @@ -46,6 +53,23 @@ def __getitem__(self, idx) -> RetrievalDoc: def __eq__(self, __value: object) -> bool: return False + def _build_idx_docid_mapping(self, dataset_attr_name: str = "dataset") -> None: + self.idx_to_docid: dict = {} + self.docid_to_idx: dict = {} + id_key: str = getattr(self, "id_key", None) + dataset = getattr(self, dataset_attr_name) + if id_key: + for idx, doc_dict in enumerate(dataset): + self.idx_to_docid[idx] = doc_dict[id_key] + self.docid_to_idx[doc_dict[id_key]] = idx + elif isinstance(dataset[0], RetrievalDoc): + for idx, doc in enumerate(dataset): + doc: RetrievalDoc + self.idx_to_docid[idx] = doc.id + self.docid_to_idx[doc.id] = idx + else: + raise ValueError(f"Invalid dataset type: list[{type(dataset[0])}]") + class HfRetrievalQueryDataset(RetrievalQueryDataset): def __init__( @@ -124,6 +148,7 @@ def __init__(self, path: str, split: str, name: str | None = None, id_key: str = self.dataset = datasets.load_dataset(path, split=split, name=name, trust_remote_code=True) self.id_key = id_key self.text_key = text_key + self._build_idx_docid_mapping() def __len__(self): return len(self.dataset) @@ -150,6 +175,7 @@ def __init__(self, filename: str, id_key: str = "docid", text_key: str = "text") self.dataset = corpus self.id_key = id_key self.text_key = text_key + self._build_idx_docid_mapping() def __len__(self): return len(self.dataset) diff --git a/src/jmteb/evaluators/retrieval/evaluator.py b/src/jmteb/evaluators/retrieval/evaluator.py index c7edc59..90549a8 100644 --- a/src/jmteb/evaluators/retrieval/evaluator.py +++ b/src/jmteb/evaluators/retrieval/evaluator.py @@ -15,7 +15,13 @@ from jmteb.embedders.base import 
TextEmbedder from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults -from .data import RetrievalDocDataset, RetrievalQueryDataset +from .data import ( + RetrievalDoc, + RetrievalDocDataset, + RetrievalPrediction, + RetrievalQuery, + RetrievalQueryDataset, +) T = TypeVar("T") @@ -33,6 +39,7 @@ class RetrievalEvaluator(EmbeddingEvaluator): accuracy_at_k (list[int] | None): accuracy in top k hits. query_prefix (str | None): prefix for queries. Defaults to None. doc_prefix (str | None): prefix for documents. Defaults to None. + log_predictions (bool): whether to log predictions of each datapoint. Defaults to False. """ def __init__( @@ -45,6 +52,7 @@ def __init__( ndcg_at_k: list[int] | None = None, query_prefix: str | None = None, doc_prefix: str | None = None, + log_predictions: bool = False, ) -> None: self.val_query_dataset = val_query_dataset self.test_query_dataset = test_query_dataset @@ -59,6 +67,7 @@ def __init__( self.query_prefix = query_prefix self.doc_prefix = doc_prefix + self.log_predictions = log_predictions def __call__( self, @@ -103,7 +112,7 @@ def __call__( val_results = {} for dist_name, dist_func in dist_functions.items(): - val_results[dist_name] = self._compute_metrics( + val_results[dist_name], _ = self._compute_metrics( query_dataset=self.val_query_dataset, query_embeddings=val_query_embeddings, doc_embeddings=doc_embeddings, @@ -112,14 +121,13 @@ def __call__( sorted_val_results = sorted(val_results.items(), key=lambda res: res[1][self.main_metric], reverse=True) optimal_dist_name = sorted_val_results[0][0] - test_results = { - optimal_dist_name: self._compute_metrics( - query_dataset=self.test_query_dataset, - query_embeddings=test_query_embeddings, - doc_embeddings=doc_embeddings, - dist_func=dist_functions[optimal_dist_name], - ) - } + test_scores, test_predictions = self._compute_metrics( + query_dataset=self.test_query_dataset, + query_embeddings=test_query_embeddings, + doc_embeddings=doc_embeddings, + dist_func=dist_functions[optimal_dist_name], + ) + test_results = {optimal_dist_name: test_scores} return EvaluationResults( metric_name=self.main_metric, @@ -129,6 +137,7 @@ def __call__( "val_scores": val_results, "test_scores": test_results, }, + predictions=test_predictions, ) def _compute_metrics( @@ -137,9 +146,9 @@ def _compute_metrics( query_embeddings: np.ndarray | Tensor, doc_embeddings: np.ndarray | Tensor, dist_func: Callable[[Tensor, Tensor], Tensor], - ) -> dict[str, dict[str, float]]: + ) -> tuple[dict[str, dict[str, float]], list[RetrievalPrediction]]: results: dict[str, float] = {} - + predictions: list[RetrievalPrediction] = [] if self.log_predictions else None with tqdm.tqdm(total=len(doc_embeddings), desc="Retrieval doc chunks") as pbar: top_k_indices_chunks: list[np.ndarray] = [] top_k_scores_chunks: list[np.ndarray] = [] @@ -173,13 +182,42 @@ def _compute_metrics( golden_doc_ids = [item.relevant_docs for item in query_dataset] retrieved_doc_ids = [[self.doc_dataset[i].id for i in indices] for indices in sorted_top_k_indices] + predictions = ( + self._format_predictions(query_dataset, self.doc_dataset, retrieved_doc_ids) + if self.log_predictions + else None + ) + for k in self.accuracy_at_k: results[f"accuracy@{k}"] = accuracy_at_k(golden_doc_ids, retrieved_doc_ids, k) for k in self.ndcg_at_k: results[f"ndcg@{k}"] = ndcg_at_k(golden_doc_ids, retrieved_doc_ids, k) results[f"mrr@{self.max_top_k}"] = mrr_at_k(golden_doc_ids, retrieved_doc_ids, self.max_top_k) - return results + return results, predictions + + @staticmethod + 
def _format_predictions( + query_dataset: RetrievalQueryDataset, + doc_dataset: RetrievalDocDataset, + retrieved_doc_ids: list[list], + ) -> list[RetrievalPrediction]: + predictions = [] + for q, pred_docids in zip(query_dataset, retrieved_doc_ids): + q: RetrievalQuery + golden_docs: list[RetrievalDoc] = [ + doc_dataset[doc_dataset.docid_to_idx[docid]] for docid in q.relevant_docs + ] + pred_docs: list[RetrievalDoc] = [ + doc_dataset[doc_dataset.docid_to_idx[pred_docid]] for pred_docid in pred_docids + ] + prediction = RetrievalPrediction( + query=q.query, + relevant_docs=golden_docs, + predicted_relevant_docs=pred_docs, + ) + predictions.append(prediction) + return predictions def accuracy_at_k(relevant_docs: list[list[T]], top_hits: list[list[T]], k: int) -> float: diff --git a/tests/evaluator/test_classification_evaluator.py b/tests/evaluator/test_classification_evaluator.py index bce9964..761198e 100644 --- a/tests/evaluator/test_classification_evaluator.py +++ b/tests/evaluator/test_classification_evaluator.py @@ -90,3 +90,22 @@ def test_classification_jsonl_dataset_equal(): assert dummy_jsonl_dataset_1 == dummy_jsonl_dataset_2 dummy_jsonl_dataset_2.label_key = "LABEL" assert dummy_jsonl_dataset_1 != dummy_jsonl_dataset_2 + + +def test_classification_prediction_logging(embedder): + dataset = DummyClassificationDataset() + evaluator = ClassificationEvaluator( + train_dataset=dataset, + val_dataset=dataset, + test_dataset=dataset, + classifiers={ + "logreg": LogRegClassifier(), + "knn": KnnClassifier(k=2, distance_metric="cosine"), + }, + log_predictions=True, + ) + results = evaluator(model=embedder) + assert isinstance(results.predictions, list) + assert [p.text for p in results.predictions] == [d.text for d in dataset] + assert [p.label for p in results.predictions] == [d.label for d in dataset] + assert all([isinstance(p.prediction, int) for p in results.predictions]) diff --git a/tests/evaluator/test_retrieval_evaluator.py b/tests/evaluator/test_retrieval_evaluator.py index fa52c52..d76d65d 100644 --- a/tests/evaluator/test_retrieval_evaluator.py +++ b/tests/evaluator/test_retrieval_evaluator.py @@ -8,6 +8,7 @@ from jmteb.evaluators.retrieval.data import ( JsonlRetrievalDocDataset, JsonlRetrievalQueryDataset, + RetrievalPrediction, ) EXPECTED_OUTPUT_DICT_KEYS = {"val_scores", "test_scores", "optimal_distance_metric"} @@ -19,6 +20,7 @@ class DummyDocDataset(RetrievalDocDataset): def __init__(self, prefix: str = ""): self._items = [RetrievalDoc(id=str(i), text=f"{prefix}dummy document {i}") for i in range(30)] + self._build_idx_docid_mapping("_items") def __len__(self): return len(self._items) @@ -60,6 +62,23 @@ def test_retrieval_evaluator(embedder): assert any(score.startswith(metric) for metric in ["accuracy", "mrr", "ndcg"]) +def test_retrieval_evaluator_with_predictions(embedder): + dummy_query_dataset = DummyQueryDataset() + dummy_doc_dataset = DummyDocDataset() + evaluator = RetrievalEvaluator( + val_query_dataset=dummy_query_dataset, + test_query_dataset=dummy_query_dataset, + doc_dataset=dummy_doc_dataset, + accuracy_at_k=[1, 3, 5, 10], + ndcg_at_k=[1, 3, 5], + doc_chunk_size=3, + log_predictions=True, + ) + results = evaluator(model=embedder) + assert [p.query for p in results.predictions] == [q.query for q in dummy_query_dataset] + assert all([isinstance(p, RetrievalPrediction) for p in results.predictions]) + + def test_retrieval_evaluator_with_prefix(embedder): evaluator_with_prefix = RetrievalEvaluator( val_query_dataset=DummyQueryDataset(), From 
fea50d5b4f56ec729d0894b6d3a55b6801b6af2b Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Thu, 25 Jul 2024 04:04:34 +0900
Subject: [PATCH 02/13] fix EvaluationResults

---
 src/jmteb/evaluators/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/jmteb/evaluators/base.py b/src/jmteb/evaluators/base.py
index 7b47379..a94c96c 100644
--- a/src/jmteb/evaluators/base.py
+++ b/src/jmteb/evaluators/base.py
@@ -19,11 +19,13 @@ class EvaluationResults:
         metric_value (float): Value of the main metric.
         details (dict[str, Any]): Details of the evaluation. This included some additional metrics or values
             that are used to derive the main metric.
+        predictions (list[Any]): Predictions (e.g., (text, y_true, y_pred))
     """
 
     metric_name: str
     metric_value: float
     details: dict[str, Any]
+    predictions: list[Any] | None = None
 
     def as_dict(self) -> dict[str, Any]:
         return {

From 86eef3da54f266959682afd7d5ac87c1be9c013f Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Sat, 27 Jul 2024 02:08:10 +0900
Subject: [PATCH 03/13] Add an argument to control how many predicted docs are logged

---
 src/jmteb/evaluators/retrieval/evaluator.py | 7 ++++++-
 tests/evaluator/test_retrieval_evaluator.py | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/jmteb/evaluators/retrieval/evaluator.py b/src/jmteb/evaluators/retrieval/evaluator.py
index 90549a8..d5339f9 100644
--- a/src/jmteb/evaluators/retrieval/evaluator.py
+++ b/src/jmteb/evaluators/retrieval/evaluator.py
@@ -40,6 +40,7 @@ class RetrievalEvaluator(EmbeddingEvaluator):
         query_prefix (str | None): prefix for queries. Defaults to None.
         doc_prefix (str | None): prefix for documents. Defaults to None.
         log_predictions (bool): whether to log predictions of each datapoint. Defaults to False.
+        top_n_docs_to_log (int): log only top n documents that are predicted as relevant. Defaults to 5.
""" def __init__( @@ -53,6 +54,7 @@ def __init__( query_prefix: str | None = None, doc_prefix: str | None = None, log_predictions: bool = False, + top_n_docs_to_log: int = 5, ) -> None: self.val_query_dataset = val_query_dataset self.test_query_dataset = test_query_dataset @@ -68,6 +70,7 @@ def __init__( self.query_prefix = query_prefix self.doc_prefix = doc_prefix self.log_predictions = log_predictions + self.top_n_docs_to_log = top_n_docs_to_log def __call__( self, @@ -183,7 +186,7 @@ def _compute_metrics( retrieved_doc_ids = [[self.doc_dataset[i].id for i in indices] for indices in sorted_top_k_indices] predictions = ( - self._format_predictions(query_dataset, self.doc_dataset, retrieved_doc_ids) + self._format_predictions(query_dataset, self.doc_dataset, retrieved_doc_ids, self.top_n_docs_to_log) if self.log_predictions else None ) @@ -201,6 +204,7 @@ def _format_predictions( query_dataset: RetrievalQueryDataset, doc_dataset: RetrievalDocDataset, retrieved_doc_ids: list[list], + top_n_to_log: int, ) -> list[RetrievalPrediction]: predictions = [] for q, pred_docids in zip(query_dataset, retrieved_doc_ids): @@ -208,6 +212,7 @@ def _format_predictions( golden_docs: list[RetrievalDoc] = [ doc_dataset[doc_dataset.docid_to_idx[docid]] for docid in q.relevant_docs ] + pred_docids = pred_docids[:top_n_to_log] pred_docs: list[RetrievalDoc] = [ doc_dataset[doc_dataset.docid_to_idx[pred_docid]] for pred_docid in pred_docids ] diff --git a/tests/evaluator/test_retrieval_evaluator.py b/tests/evaluator/test_retrieval_evaluator.py index d76d65d..21fb6e9 100644 --- a/tests/evaluator/test_retrieval_evaluator.py +++ b/tests/evaluator/test_retrieval_evaluator.py @@ -15,6 +15,7 @@ EXPECTED_DIST_FUNC_NAMES = {"cosine_similarity", "euclidean_distance", "dot_score"} QUERY_PREFIX = "クエリ: " DOC_PREFIX = "ドキュメント: " +TOP_N_DOCS_TO_LOG = 4 class DummyDocDataset(RetrievalDocDataset): @@ -73,10 +74,15 @@ def test_retrieval_evaluator_with_predictions(embedder): ndcg_at_k=[1, 3, 5], doc_chunk_size=3, log_predictions=True, + top_n_docs_to_log=TOP_N_DOCS_TO_LOG, ) results = evaluator(model=embedder) assert [p.query for p in results.predictions] == [q.query for q in dummy_query_dataset] assert all([isinstance(p, RetrievalPrediction) for p in results.predictions]) + for p in results.predictions: + assert isinstance(p, RetrievalPrediction) + assert len(p.predicted_relevant_docs) == TOP_N_DOCS_TO_LOG + assert all([isinstance(doc, RetrievalDoc) for doc in p.predicted_relevant_docs]) def test_retrieval_evaluator_with_prefix(embedder): From f5fe5d31550c4cf8f29d0a53c16664f3d3569dae Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Sat, 27 Jul 2024 02:39:34 +0900 Subject: [PATCH 04/13] fix imports and tests --- src/jmteb/evaluators/classification/__init__.py | 6 +++++- src/jmteb/evaluators/retrieval/__init__.py | 1 + tests/evaluator/test_classification_evaluator.py | 16 ++++++++++++++++ tests/evaluator/test_retrieval_evaluator.py | 2 +- 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/jmteb/evaluators/classification/__init__.py b/src/jmteb/evaluators/classification/__init__.py index 6c85424..4c1bfbb 100644 --- a/src/jmteb/evaluators/classification/__init__.py +++ b/src/jmteb/evaluators/classification/__init__.py @@ -1,3 +1,7 @@ from .classifiers import Classifier, KnnClassifier, LogRegClassifier -from .data import ClassificationDataset, ClassificationInstance +from .data import ( + ClassificationDataset, + ClassificationInstance, + ClassificationPrediction, +) from .evaluator import ClassificationEvaluator 
diff --git a/src/jmteb/evaluators/retrieval/__init__.py b/src/jmteb/evaluators/retrieval/__init__.py index c63354c..73d4e33 100644 --- a/src/jmteb/evaluators/retrieval/__init__.py +++ b/src/jmteb/evaluators/retrieval/__init__.py @@ -1,6 +1,7 @@ from .data import ( RetrievalDoc, RetrievalDocDataset, + RetrievalPrediction, RetrievalQuery, RetrievalQueryDataset, ) diff --git a/tests/evaluator/test_classification_evaluator.py b/tests/evaluator/test_classification_evaluator.py index 761198e..77cc542 100644 --- a/tests/evaluator/test_classification_evaluator.py +++ b/tests/evaluator/test_classification_evaluator.py @@ -2,6 +2,7 @@ ClassificationDataset, ClassificationEvaluator, ClassificationInstance, + ClassificationPrediction, KnnClassifier, LogRegClassifier, ) @@ -44,6 +45,21 @@ def test_classification_evaluator(embedder): assert set(value.keys()) == expected_metrics +def test_classification_evaluator_with_predictions(embedder): + evaluator = ClassificationEvaluator( + train_dataset=DummyClassificationDataset(), + val_dataset=DummyClassificationDataset(), + test_dataset=DummyClassificationDataset(), + classifiers={ + "logreg": LogRegClassifier(), + "knn": KnnClassifier(k=2, distance_metric="cosine"), + }, + log_predictions=True, + ) + results = evaluator(model=embedder) + assert all([isinstance(result, ClassificationPrediction) for result in results.predictions]) + + def test_classification_evaluator_with_prefix(embedder): evaluator_with_prefix = ClassificationEvaluator( train_dataset=DummyClassificationDataset(), diff --git a/tests/evaluator/test_retrieval_evaluator.py b/tests/evaluator/test_retrieval_evaluator.py index 21fb6e9..2bc5e32 100644 --- a/tests/evaluator/test_retrieval_evaluator.py +++ b/tests/evaluator/test_retrieval_evaluator.py @@ -2,13 +2,13 @@ RetrievalDoc, RetrievalDocDataset, RetrievalEvaluator, + RetrievalPrediction, RetrievalQuery, RetrievalQueryDataset, ) from jmteb.evaluators.retrieval.data import ( JsonlRetrievalDocDataset, JsonlRetrievalQueryDataset, - RetrievalPrediction, ) EXPECTED_OUTPUT_DICT_KEYS = {"val_scores", "test_scores", "optimal_distance_metric"} From 957b8f62eb696380af0d4b857cd9fb93f5ad7921 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Sat, 27 Jul 2024 02:40:04 +0900 Subject: [PATCH 05/13] Implement prediction logging in STS --- src/jmteb/evaluators/sts/__init__.py | 2 +- src/jmteb/evaluators/sts/data.py | 9 +++++ src/jmteb/evaluators/sts/evaluator.py | 57 +++++++++++++++++++-------- tests/evaluator/test_sts_evaluator.py | 8 +++- 4 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/jmteb/evaluators/sts/__init__.py b/src/jmteb/evaluators/sts/__init__.py index 502fdc8..665402c 100644 --- a/src/jmteb/evaluators/sts/__init__.py +++ b/src/jmteb/evaluators/sts/__init__.py @@ -1,2 +1,2 @@ -from .data import STSDataset, STSInstance +from .data import STSDataset, STSInstance, STSPrediction from .evaluator import STSEvaluator diff --git a/src/jmteb/evaluators/sts/data.py b/src/jmteb/evaluators/sts/data.py index 02504e7..a2166a5 100644 --- a/src/jmteb/evaluators/sts/data.py +++ b/src/jmteb/evaluators/sts/data.py @@ -14,6 +14,15 @@ class STSInstance: score: float +@dataclass +class STSPrediction: + sentence1: str + sentence2: str + true_score: float + predicted_score: float + similarity_function_name: str + + class STSDataset(ABC): @abstractmethod def __len__(self): diff --git a/src/jmteb/evaluators/sts/evaluator.py b/src/jmteb/evaluators/sts/evaluator.py index 4999fa5..cbea7e2 100644 --- a/src/jmteb/evaluators/sts/evaluator.py +++ 
b/src/jmteb/evaluators/sts/evaluator.py @@ -14,7 +14,7 @@ from jmteb.embedders.base import TextEmbedder from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults -from .data import STSDataset +from .data import STSDataset, STSInstance, STSPrediction class STSEvaluator(EmbeddingEvaluator): @@ -34,12 +34,14 @@ def __init__( test_dataset: STSDataset, sentence1_prefix: str | None = None, sentence2_prefix: str | None = None, + log_predictions: bool = False, ) -> None: self.val_dataset = val_dataset self.test_dataset = test_dataset self.sentence1_prefix = sentence1_prefix self.sentence2_prefix = sentence2_prefix self.main_metric = "spearman" + self.log_predictions = log_predictions def __call__( self, model: TextEmbedder, cache_dir: str | PathLike[str] | None = None, overwrite_cache: bool = False @@ -69,7 +71,7 @@ def __call__( val_results = {} for sim_name, sim_func in similarity_functions.items(): - val_results[sim_name] = self._compute_similarity( + val_results[sim_name], _ = self._compute_similarity( val_embeddings1, val_embeddings2, val_golden_scores, sim_func ) @@ -80,36 +82,57 @@ def __call__( )[ 0 ][0] - test_results = { - optimal_similarity_name: self._compute_similarity( - test_embeddings1, - test_embeddings2, - test_golden_scores, - similarity_functions[optimal_similarity_name], - ) - } + test_eval_scores, test_sim_scores = self._compute_similarity( + test_embeddings1, + test_embeddings2, + test_golden_scores, + similarity_functions[optimal_similarity_name], + ) return EvaluationResults( metric_name=self.main_metric, - metric_value=test_results[optimal_similarity_name][self.main_metric], + metric_value=test_eval_scores[self.main_metric], details={ "optimal_similarity_metric": optimal_similarity_name, "val_scores": val_results, - "test_scores": test_results, + "test_scores": {optimal_similarity_name: test_eval_scores}, }, + predictions=( + self._format_predictions(self.test_dataset, test_sim_scores, optimal_similarity_name) + if self.log_predictions + else None + ), ) @staticmethod def _compute_similarity( embeddings1: Tensor, embeddings2: Tensor, golden_scores: list, similarity_func: Callable - ) -> dict[str, float]: - test_sim_score = similarity_func(embeddings1, embeddings2).cpu() - pearson = pearsonr(golden_scores, test_sim_score)[0] - spearman = spearmanr(golden_scores, test_sim_score)[0] + ) -> tuple[dict[str, float], list[float]]: + sim_scores = similarity_func(embeddings1, embeddings2).cpu() + pearson = pearsonr(golden_scores, sim_scores)[0] + spearman = spearmanr(golden_scores, sim_scores)[0] return { "pearson": pearson if not math.isnan(pearson) else 0.0, "spearman": spearman if not math.isnan(spearman) else 0.0, - } + }, sim_scores.tolist() + + @staticmethod + def _format_predictions( + dataset: STSDataset, sim_scores: list[float], similarity_function_name: str + ) -> list[STSPrediction]: + predictions = [] + for item, sim_score in zip(dataset, sim_scores): + item: STSInstance + predictions.append( + STSPrediction( + sentence1=item.sentence1, + sentence2=item.sentence2, + true_score=item.score, + predicted_score=sim_score, + similarity_function_name=similarity_function_name, + ) + ) + return predictions def _convert_to_embeddings( self, diff --git a/tests/evaluator/test_sts_evaluator.py b/tests/evaluator/test_sts_evaluator.py index 69469cc..d7a6d1c 100644 --- a/tests/evaluator/test_sts_evaluator.py +++ b/tests/evaluator/test_sts_evaluator.py @@ -1,4 +1,4 @@ -from jmteb.evaluators.sts import STSDataset, STSEvaluator, STSInstance +from jmteb.evaluators.sts import 
STSDataset, STSEvaluator, STSInstance, STSPrediction from jmteb.evaluators.sts.data import JsonlSTSDataset EXPECTED_OUTPUT_DICT_KEYS = {"val_scores", "test_scores", "optimal_similarity_metric"} @@ -37,6 +37,12 @@ def test_sts(embedder): assert set(results.details[score_splitname][dist].keys()) == EXPECTED_METRIC_NAMES +def test_sts_with_predictions(embedder): + evaluator = STSEvaluator(val_dataset=DummySTSDataset(), test_dataset=DummySTSDataset(), log_predictions=True) + results = evaluator(model=embedder) + assert all([isinstance(result, STSPrediction) for result in results.predictions]) + + def test_sts_with_prefix(embedder): evaluator_with_prefix = STSEvaluator( val_dataset=DummySTSDataset(), From eea32ddac6a4516fddb8f3f46e9d326393bbc5f5 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Sat, 27 Jul 2024 02:49:05 +0900 Subject: [PATCH 06/13] Implement prediction logging in clustering --- src/jmteb/evaluators/clustering/__init__.py | 2 +- src/jmteb/evaluators/clustering/data.py | 7 ++++ src/jmteb/evaluators/clustering/evaluator.py | 34 +++++++++++++------- tests/evaluator/test_clustering_evaluator.py | 9 ++++++ 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/src/jmteb/evaluators/clustering/__init__.py b/src/jmteb/evaluators/clustering/__init__.py index 5164b12..e22bd0d 100644 --- a/src/jmteb/evaluators/clustering/__init__.py +++ b/src/jmteb/evaluators/clustering/__init__.py @@ -1,2 +1,2 @@ -from .data import ClusteringDataset, ClusteringInstance +from .data import ClusteringDataset, ClusteringInstance, ClusteringPrediction from .evaluator import ClusteringEvaluator diff --git a/src/jmteb/evaluators/clustering/data.py b/src/jmteb/evaluators/clustering/data.py index ee2ec4f..64d9608 100644 --- a/src/jmteb/evaluators/clustering/data.py +++ b/src/jmteb/evaluators/clustering/data.py @@ -13,6 +13,13 @@ class ClusteringInstance: label: int +@dataclass +class ClusteringPrediction: + text: str + label: int + prediction: int + + class ClusteringDataset(ABC): @abstractmethod def __len__(self): diff --git a/src/jmteb/evaluators/clustering/evaluator.py b/src/jmteb/evaluators/clustering/evaluator.py index d8ef443..4f3cd3c 100644 --- a/src/jmteb/evaluators/clustering/evaluator.py +++ b/src/jmteb/evaluators/clustering/evaluator.py @@ -18,7 +18,7 @@ from jmteb.embedders.base import TextEmbedder from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults -from .data import ClusteringDataset +from .data import ClusteringDataset, ClusteringPrediction class ClusteringEvaluator(EmbeddingEvaluator): @@ -32,11 +32,13 @@ def __init__( test_dataset: ClusteringDataset, prefix: str | None = None, random_seed: int | None = None, + log_predictions: bool = False, ) -> None: self.val_dataset = val_dataset self.test_dataset = test_dataset self.prefix = prefix self.random_seed = random_seed + self.log_predictions = log_predictions self.main_metric = "v_measure_score" def __call__( @@ -80,20 +82,21 @@ def __call__( logger.info("Fitting clustering model...") val_results = {} for model_name, model_constructor in model_constructors.items(): - val_results[model_name] = self._evaluate_clustering_model(val_embeddings, val_labels, model_constructor()) + val_results[model_name], _ = self._evaluate_clustering_model( + val_embeddings, val_labels, model_constructor() + ) optimal_clustering_model_name = sorted( val_results.items(), key=lambda res: res[1][self.main_metric], reverse=True, )[0][0] - test_results = { - optimal_clustering_model_name: self._evaluate_clustering_model( - test_embeddings, - 
test_labels, - model_constructors[optimal_clustering_model_name](), - ) - } + test_scores, test_predictions = self._evaluate_clustering_model( + test_embeddings, + test_labels, + model_constructors[optimal_clustering_model_name](), + ) + test_results = {optimal_clustering_model_name: test_scores} return EvaluationResults( metric_name=self.main_metric, @@ -103,12 +106,15 @@ def __call__( "val_scores": val_results, "test_scores": test_results, }, + predictions=( + self._format_predictions(self.test_dataset, test_predictions) if self.log_predictions else None + ), ) @staticmethod def _evaluate_clustering_model( embeddings: np.ndarray, y_true: list[int], clustering_model: ClusterMixin - ) -> dict[str, float]: + ) -> tuple[dict[str, float], list[int]]: y_pred = clustering_model.fit_predict(embeddings) h_score, c_score, v_score = homogeneity_completeness_v_measure( labels_pred=y_pred, labels_true=np.array(y_true) @@ -118,4 +124,10 @@ def _evaluate_clustering_model( "v_measure_score": v_score, "homogeneity_score": h_score, "completeness_score": c_score, - } + }, y_pred.tolist() + + @staticmethod + def _format_predictions(dataset: ClusteringDataset, predictions: list[int]) -> list[ClusteringPrediction]: + return [ + ClusteringPrediction(item.text, item.label, prediction) for item, prediction in zip(dataset, predictions) + ] diff --git a/tests/evaluator/test_clustering_evaluator.py b/tests/evaluator/test_clustering_evaluator.py index 217d850..50880bd 100644 --- a/tests/evaluator/test_clustering_evaluator.py +++ b/tests/evaluator/test_clustering_evaluator.py @@ -2,6 +2,7 @@ ClusteringDataset, ClusteringEvaluator, ClusteringInstance, + ClusteringPrediction, ) from jmteb.evaluators.clustering.data import JsonlClusteringDataset @@ -39,6 +40,14 @@ def test_kmeans_clustering(embedder): assert set(results.details[score_splitname][clustering_model].keys()) == expected_metrics +def test_clustering_with_predictions(embedder): + evaluator = ClusteringEvaluator( + val_dataset=DummyClusteringDataset(), test_dataset=DummyClusteringDataset(), log_predictions=True + ) + results = evaluator(model=embedder) + assert all([isinstance(p, ClusteringPrediction) for p in results.predictions]) + + def test_clustering_with_prefix(embedder): evaluator_with_prefix = ClusteringEvaluator( val_dataset=DummyClusteringDataset(), From 1d53001c08e49a5a2174a61d1c496d1e8c03df3a Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Sat, 27 Jul 2024 03:03:08 +0900 Subject: [PATCH 07/13] give up implementing prediction logging in pair classification --- src/jmteb/evaluators/pair_classification/evaluator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/jmteb/evaluators/pair_classification/evaluator.py b/src/jmteb/evaluators/pair_classification/evaluator.py index 6ec30d0..280bbfb 100644 --- a/src/jmteb/evaluators/pair_classification/evaluator.py +++ b/src/jmteb/evaluators/pair_classification/evaluator.py @@ -22,6 +22,8 @@ class PairClassificationEvaluator(EmbeddingEvaluator): test_dataset (PairClassificationDataset): test dataset sentence1_prefix (str | None): prefix for sentence1. Defaults to None. sentence2_prefix (str | None): prefix for sentence2. Defaults to None. + + # NOTE: Don't log predictions, as predictions by different metrics could be different. 
""" def __init__( From 2da792a09bd01384d20d051b9684cf0e47ab6991 Mon Sep 17 00:00:00 2001 From: "shengzhe.li" Date: Tue, 30 Jul 2024 00:57:31 +0900 Subject: [PATCH 08/13] Add prediction logging for reranking --- src/jmteb/evaluators/reranking/__init__.py | 1 + src/jmteb/evaluators/reranking/data.py | 26 ++++++++ src/jmteb/evaluators/reranking/evaluator.py | 69 +++++++++++++++++---- tests/evaluator/test_reranking_evaluator.py | 21 +++++++ 4 files changed, 105 insertions(+), 12 deletions(-) diff --git a/src/jmteb/evaluators/reranking/__init__.py b/src/jmteb/evaluators/reranking/__init__.py index 9931fcb..023120b 100644 --- a/src/jmteb/evaluators/reranking/__init__.py +++ b/src/jmteb/evaluators/reranking/__init__.py @@ -1,6 +1,7 @@ from .data import ( RerankingDoc, RerankingDocDataset, + RerankingPrediction, RerankingQuery, RerankingQueryDataset, ) diff --git a/src/jmteb/evaluators/reranking/data.py b/src/jmteb/evaluators/reranking/data.py index 4875729..e4a5228 100644 --- a/src/jmteb/evaluators/reranking/data.py +++ b/src/jmteb/evaluators/reranking/data.py @@ -22,6 +22,13 @@ class RerankingDoc: text: str +@dataclass +class RerankingPrediction: + query: str + relevant_docs: list[RerankingDoc] + reranked_relevant_docs: list[RerankingDoc] + + class RerankingQueryDataset(ABC): @abstractmethod def __len__(self): @@ -47,6 +54,23 @@ def __getitem__(self, idx) -> RerankingDoc: def __eq__(self, __value: object) -> bool: return False + def _build_idx_docid_mapping(self, dataset_attr_name: str = "dataset") -> None: + self.idx_to_docid: dict = {} + self.docid_to_idx: dict = {} + id_key: str = getattr(self, "id_key", None) + dataset = getattr(self, dataset_attr_name) + if id_key: + for idx, doc_dict in enumerate(dataset): + self.idx_to_docid[idx] = doc_dict[id_key] + self.docid_to_idx[doc_dict[id_key]] = idx + elif isinstance(dataset[0], RerankingDoc): + for idx, doc in enumerate(dataset): + doc: RerankingDoc + self.idx_to_docid[idx] = doc.id + self.docid_to_idx[doc.id] = idx + else: + raise ValueError(f"Invalid dataset type: list[{type(dataset[0])}]") + class HfRerankingQueryDataset(RerankingQueryDataset): def __init__( @@ -131,6 +155,7 @@ def __init__(self, path: str, split: str, name: str | None = None, id_key: str = self.dataset = datasets.load_dataset(path, split=split, name=name, trust_remote_code=True) self.id_key = id_key self.text_key = text_key + self._build_idx_docid_mapping() def __len__(self): return len(self.dataset) @@ -157,6 +182,7 @@ def __init__(self, filename: str, id_key: str = "docid", text_key: str = "text") self.dataset = corpus self.id_key = id_key self.text_key = text_key + self._build_idx_docid_mapping() def __len__(self): return len(self.dataset) diff --git a/src/jmteb/evaluators/reranking/evaluator.py b/src/jmteb/evaluators/reranking/evaluator.py index 41baf6c..15ade24 100644 --- a/src/jmteb/evaluators/reranking/evaluator.py +++ b/src/jmteb/evaluators/reranking/evaluator.py @@ -14,7 +14,13 @@ from jmteb.embedders.base import TextEmbedder from jmteb.evaluators.base import EmbeddingEvaluator, EvaluationResults -from .data import RerankingDocDataset, RerankingQueryDataset +from .data import ( + RerankingDoc, + RerankingDocDataset, + RerankingPrediction, + RerankingQuery, + RerankingQueryDataset, +) T = TypeVar("T") @@ -30,6 +36,8 @@ class RerankingEvaluator(EmbeddingEvaluator): ndcg_at_k (list[int] | None): top k documents to consider in NDCG (Normalized Documented Cumulative Gain). query_prefix (str | None): prefix for queries. Defaults to None. 
doc_prefix (str | None): prefix for documents. Defaults to None. + log_predictions (bool): whether to log predictions of each datapoint. Defaults to False. + top_n_docs_to_log (int): log only top n documents. Defaults to 5. """ def __init__( @@ -40,6 +48,8 @@ def __init__( ndcg_at_k: list[int] | None = None, query_prefix: str | None = None, doc_prefix: str | None = None, + log_predictions: bool = False, + top_n_docs_to_log: int = 5, ) -> None: self.test_query_dataset = test_query_dataset self.val_query_dataset = val_query_dataset @@ -48,6 +58,8 @@ def __init__( self.main_metric = f"ndcg@{self.ndcg_at_k[0]}" self.query_prefix = query_prefix self.doc_prefix = doc_prefix + self.log_predictions = log_predictions + self.top_n_docs_to_log = top_n_docs_to_log def __call__( self, @@ -91,7 +103,7 @@ def __call__( val_results = {} for dist_name, dist_func in dist_functions.items(): - val_results[dist_name] = self._compute_metrics( + val_results[dist_name], _ = self._compute_metrics( query_dataset=self.val_query_dataset, query_embeddings=val_query_embeddings, doc_embeddings=doc_embeddings, @@ -100,14 +112,13 @@ def __call__( sorted_val_results = sorted(val_results.items(), key=lambda res: res[1][self.main_metric], reverse=True) optimal_dist_name = sorted_val_results[0][0] - test_results = { - optimal_dist_name: self._compute_metrics( - query_dataset=self.test_query_dataset, - query_embeddings=test_query_embeddings, - doc_embeddings=doc_embeddings, - dist_func=dist_functions[optimal_dist_name], - ) - } + scores, reranked_docs_list = self._compute_metrics( + query_dataset=self.test_query_dataset, + query_embeddings=test_query_embeddings, + doc_embeddings=doc_embeddings, + dist_func=dist_functions[optimal_dist_name], + ) + test_results = {optimal_dist_name: scores} return EvaluationResults( metric_name=self.main_metric, @@ -117,6 +128,13 @@ def __call__( "val_scores": val_results, "test_scores": test_results, }, + predictions=( + self._format_predictions( + self.test_query_dataset, self.doc_dataset, reranked_docs_list, self.top_n_docs_to_log + ) + if self.log_predictions + else None + ), ) def _compute_metrics( @@ -125,7 +143,7 @@ def _compute_metrics( query_embeddings: np.ndarray | Tensor, doc_embeddings: np.ndarray | Tensor, dist_func: Callable[[Tensor, Tensor], Tensor], - ) -> dict[str, float]: + ) -> tuple[dict[str, float], list[list[str | int]]]: doc_indices = {item.id: i for i, item in enumerate(self.doc_dataset)} results: dict[str, float] = {} @@ -156,7 +174,34 @@ def _compute_metrics( for k in self.ndcg_at_k: results[f"ndcg@{k}"] = ndcg_at_k(retrieved_docs_list, relevance_scores_list, reranked_docs_list, k) - return results + return results, reranked_docs_list + + @staticmethod + def _format_predictions( + query_dataset: RerankingQueryDataset, + doc_dataset: RerankingDocDataset, + reranked_docs_list: list[list], + top_n_to_log: int, + ) -> list[RerankingPrediction]: + predictions = [] + for q, pred_docids in zip(query_dataset, reranked_docs_list): + q: RerankingQuery + golden_docs: list[RerankingDoc] = [ + doc_dataset[doc_dataset.docid_to_idx[docid]] for docid in q.retrieved_docs + ] + pred_docids = pred_docids[:top_n_to_log] + pred_docs: list[RerankingDoc] = [ + doc_dataset[doc_dataset.docid_to_idx[pred_docid]] for pred_docid in pred_docids + ] + logger.info(f"{golden_docs=}") + logger.info(f"{pred_docs=}") + prediction = RerankingPrediction( + query=q.query, + relevant_docs=golden_docs, + reranked_relevant_docs=pred_docs, + ) + predictions.append(prediction) + return predictions def ndcg_at_k( 
diff --git a/tests/evaluator/test_reranking_evaluator.py b/tests/evaluator/test_reranking_evaluator.py
index ef847a9..e9ea894 100644
--- a/tests/evaluator/test_reranking_evaluator.py
+++ b/tests/evaluator/test_reranking_evaluator.py
@@ -1,3 +1,5 @@
+from loguru import logger
+
 from jmteb.evaluators.reranking import (
     RerankingDoc,
     RerankingDocDataset,
@@ -8,17 +10,20 @@
 from jmteb.evaluators.reranking.data import (
     JsonlRerankingDocDataset,
     JsonlRerankingQueryDataset,
+    RerankingPrediction,
 )
 
 EXPECTED_OUTPUT_DICT_KEYS = {"val_scores", "test_scores", "optimal_distance_metric"}
 EXPECTED_DIST_FUNC_NAMES = {"cosine_similarity", "euclidean_distance", "dot_score"}
 QUERY_PREFIX = "クエリ: "
 DOC_PREFIX = "ドキュメント: "
+TOP_N_DOCS_TO_LOG = 4
 
 
 class DummyDocDataset(RerankingDocDataset):
     def __init__(self, prefix: str = ""):
         self._items = [RerankingDoc(id=str(i), text=f"{prefix}dummy document {i}") for i in range(30)]
+        self._build_idx_docid_mapping("_items")
 
     def __len__(self):
         return len(self._items)
@@ -60,6 +65,22 @@ def test_reranking_evaluator(embedder):
             assert any(score.startswith(metric) for metric in ["ndcg"])
 
 
+def test_reranking_evaluator_with_predictions(embedder):
+    evaluator = RerankingEvaluator(
+        val_query_dataset=DummyQueryDataset(),
+        test_query_dataset=DummyQueryDataset(),
+        doc_dataset=DummyDocDataset(),
+        log_predictions=True,
+        top_n_docs_to_log=TOP_N_DOCS_TO_LOG,
+    )
+    results = evaluator(model=embedder)
+    logger.info(f"{results.predictions=}")
+    for p in results.predictions:
+        assert isinstance(p, RerankingPrediction)
+        assert len(p.reranked_relevant_docs) <= TOP_N_DOCS_TO_LOG
+        assert all([isinstance(doc, RerankingDoc) for doc in p.reranked_relevant_docs])
+
+
 def test_reranking_evaluator_with_prefix(embedder):
     evaluator_with_prefix = RerankingEvaluator(
         val_query_dataset=DummyQueryDataset(),

From 8890f3b3e9d2a6c7b10837aba106c905e3236724 Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Tue, 30 Jul 2024 00:58:15 +0900
Subject: [PATCH 09/13] Add an option to output predictions for all datasets

---
 src/jmteb/__main__.py | 8 ++++++++
 tests/test_main.py    | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/jmteb/__main__.py b/src/jmteb/__main__.py
index bb9af7f..03c7c63 100644
--- a/src/jmteb/__main__.py
+++ b/src/jmteb/__main__.py
@@ -60,6 +60,9 @@ def main(
     parser.add_argument("--overwrite_cache", type=bool, default=False, help="Overwrite the save_dir if it exists")
     parser.add_argument("--eval_include", type=list[str], default=None, help="Evaluators to include.")
     parser.add_argument("--eval_exclude", type=list[str], default=None, help="Evaluators to exclude.")
+    parser.add_argument(
+        "--log_predictions", type=bool, default=False, help="Whether to log predictions for all evaluators."
+    )
 
     args = parser.parse_args()
 
@@ -99,6 +102,11 @@ def main(
             f"Please check {args.evaluators}"
         )
 
+    if args.log_predictions:
+        for k, v in args.evaluators.items():
+            if hasattr(v, "log_predictions"):
+                args.evaluators[k].log_predictions = True
+
     main(
         text_embedder=args.embedder,
         evaluators=args.evaluators,
diff --git a/tests/test_main.py b/tests/test_main.py
index 05ce84a..ee81fb5 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -26,6 +26,7 @@ def test_main_cli():
         "--embedder.model_kwargs", '{"torch_dtype": "torch.float16"}',
         "--save_dir", f,
         "--eval_include", '["jsts"]',
+        "--log_predictions", "true",
     ]
     # fmt: on
     result = subprocess.run(command)

From 2eac1d70f52bc9a2b87d93e7b24245b52996c79d Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Tue, 30 Jul 2024 00:58:30 +0900
Subject: [PATCH 10/13] Update README

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 03f824d..5a35967 100644
--- a/README.md
+++ b/README.md
@@ -40,4 +40,7 @@ poetry run python -m jmteb \
 ```
 
 > [!NOTE]
-> Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`.
+> Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`. Similarly, you can also use `--eval_include` to include only evaluation datasets you want.
+
+> [!NOTE]
+> If you want to log model predictions to further analyze the performance of your model, you may want to use `--log_predictions true` to enable all evaluators to log predictions. You can also set whether to log predictions in the config of each evaluator.

From ba87d8459e4e8e1280492a2dc3920c1607fc1f6a Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Tue, 30 Jul 2024 01:05:01 +0900
Subject: [PATCH 11/13] Update README format

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 5a35967..a5c0f75 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,5 @@ poetry run python -m jmteb \
 
 > [!NOTE]
 > Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`. Similarly, you can also use `--eval_include` to include only evaluation datasets you want.
-
 > [!NOTE]
 > If you want to log model predictions to further analyze the performance of your model, you may want to use `--log_predictions true` to enable all evaluators to log predictions. You can also set whether to log predictions in the config of each evaluator.

From 19d16d1647eaf4a22ba3dfdb8e807d3a46cc2397 Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Tue, 30 Jul 2024 16:20:15 +0900
Subject: [PATCH 12/13] Fix README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index a5c0f75..5a35967 100644
--- a/README.md
+++ b/README.md
@@ -41,5 +41,6 @@ poetry run python -m jmteb \
 
 > [!NOTE]
 > Some tasks (e.g., AmazonReviewClassification in classification, JAQKET and Mr.TyDi-ja in retrieval, esci in reranking) are time-consuming and memory-consuming. Heavy retrieval tasks take hours to encode the large corpus, and use much memory for the storage of such vectors. If you want to exclude them, add `--eval_exclude "['amazon_review_classification', 'mrtydi', 'jaqket', 'esci']"`. Similarly, you can also use `--eval_include` to include only evaluation datasets you want.
+
 > [!NOTE]
 > If you want to log model predictions to further analyze the performance of your model, you may want to use `--log_predictions true` to enable all evaluators to log predictions. You can also set whether to log predictions in the config of each evaluator.

From 5ad6f694ef6257ddc2ba46e48ab1a69723071bc1 Mon Sep 17 00:00:00 2001
From: "shengzhe.li"
Date: Tue, 30 Jul 2024 16:32:32 +0900
Subject: [PATCH 13/13] ignore MD028

---
 .markdownlint.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index d6aa660..f52f5b5 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -1,3 +1,4 @@
 MD013: false
 MD040: false
-MD025: false
\ No newline at end of file
+MD025: false
+MD028: false
\ No newline at end of file
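
Taken together, the series surfaces model predictions through `EvaluationResults.predictions`. Below is a minimal sketch (not part of the patches) of how the logged predictions can be inspected from Python, using the retrieval evaluator as an example; `my_query_dataset`, `my_doc_dataset`, and `my_embedder` are placeholders for whatever `RetrievalQueryDataset`, `RetrievalDocDataset`, and `TextEmbedder` instances a caller already has, while the arguments and prediction fields come from the diffs above.

```python
from jmteb.evaluators.retrieval import RetrievalEvaluator

# Placeholders assumed to exist: my_query_dataset, my_doc_dataset, my_embedder.
evaluator = RetrievalEvaluator(
    val_query_dataset=my_query_dataset,
    test_query_dataset=my_query_dataset,
    doc_dataset=my_doc_dataset,
    log_predictions=True,   # enable prediction logging (new in this series)
    top_n_docs_to_log=5,    # keep only the top 5 predicted documents per query
)
results = evaluator(model=my_embedder)

# results.predictions is a list[RetrievalPrediction] (None when logging is disabled).
for pred in results.predictions:
    print(pred.query)
    print([doc.id for doc in pred.relevant_docs])              # golden documents
    print([doc.text for doc in pred.predicted_relevant_docs])  # top-n retrieved documents
```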