From 53390da85b1b9462504568f9dd0e769a6c485407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 24 Jan 2024 13:18:51 +0100 Subject: [PATCH 1/8] Added encode_queries() and encode_corpus() to E5Wrapper --- src/seb/registered_models/e5_models.py | 42 +++++++++++++++++--------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/seb/registered_models/e5_models.py b/src/seb/registered_models/e5_models.py index 97c45144..278a1e36 100644 --- a/src/seb/registered_models/e5_models.py +++ b/src/seb/registered_models/e5_models.py @@ -1,27 +1,20 @@ from functools import partial -from typing import Any +from typing import Any, Optional from numpy.typing import ArrayLike +from sentence_transformers import SentenceTransformer from seb import models from ..interfaces.model import EmbeddingModel, Encoder, ModelMeta from ..interfaces.task import Task -from .hf_models import get_sentence_transformer class E5Wrapper(Encoder): - def __init__(self, model_name: str): + def __init__(self, model_name: str, sep: str = " "): self.model_name = model_name - self.mdl = get_sentence_transformer(model_name) - - @staticmethod - def preprocess(sentences: list[str]) -> list[str]: - # following the documentation it is better to generally do this: - return ["query: " + sentence for sentence in sentences] - - # but it does not work slightly better than this: - # return sentences # noqa + self.mdl = SentenceTransformer(model_name) + self.sep = sep def encode( self, @@ -31,8 +24,29 @@ def encode( batch_size: int = 32, **kwargs: Any, ) -> ArrayLike: - sentences = self.preprocess(sentences) - return self.mdl.encode(sentences, batch_size=batch_size, task=task, **kwargs) # type: ignore + return self.encode_queries(sentences, batch_size=batch_size, **kwargs) + + def encode_queries(self, queries: list[str], batch_size: int, **kwargs): + sentences = ["query: " + sentence for sentence in queries] + return self.mdl.encode(sentences, batch_size=batch_size, **kwargs) + + def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs): + if type(corpus) is dict: + sentences = [ + (corpus["title"][i] + self.sep + corpus["text"][i]).strip() + if "title" in corpus + else corpus["text"][i].strip() + for i in range(len(corpus["text"])) + ] + else: + sentences = [ + (doc["title"] + self.sep + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() + for doc in corpus + ] + sentences = ["passage: " + sentence for sentence in sentences] + return self.mdl.encode(sentences, batch_size=batch_size, **kwargs) # English From 6742fdd4b710fd5b1e7ca3145454f60b4330574f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 24 Jan 2024 13:43:44 +0100 Subject: [PATCH 2/8] Added task-dependent and asymmetrical embeddings for Cohere --- src/seb/registered_models/cohere_models.py | 66 ++++++++++++++-------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/src/seb/registered_models/cohere_models.py b/src/seb/registered_models/cohere_models.py index a4f6150c..b8384fa2 100644 --- a/src/seb/registered_models/cohere_models.py +++ b/src/seb/registered_models/cohere_models.py @@ -4,54 +4,74 @@ import logging -from collections.abc import Sequence from functools import partial from typing import Any import torch import seb +from seb.interfaces.task import Task from seb.registries import models logger = logging.getLogger(__name__) class CohereTextEmbeddingModel(seb.Encoder): - def __init__(self, model_name: str) -> None: + def __init__(self, model_name: str, sep: str = " ") -> None: self.model_name = model_name - - @staticmethod - def create_sentence_blocks( - sentences: Sequence[str], - block_size: int, - ) -> list[Sequence[str]]: - sent_blocks: list[Sequence[str]] = [] - for i in range(0, len(sentences), block_size): - sent_blocks.append(sentences[i : i + block_size]) - return sent_blocks + self.sep = sep def get_embedding_dim(self) -> int: - v = self.encode(["get emb dim"]) + v = self._embed(["get emb dim"], input_type="classification") return v.shape[1] - def encode( - self, - sentences: Sequence[str], - batch_size: int = 32, # noqa: ARG002 - embed_type: str = "classification", - **kwargs: Any, # noqa: ARG002 - ) -> torch.Tensor: - import cohere # type: ignore + def _embed(self, sentences: list[str], input_type: str) -> torch.Tensor: + import cohere client = cohere.Client() response = client.embed( texts=list(sentences), model=self.model_name, - input_type=embed_type, + input_type=input_type, ) - return torch.tensor(response.embeddings) + def encode( + self, + sentences: list[str], + batch_size: int = 32, # noqa: ARG002 + *, + task: Task, + **kwargs: Any, # noqa: ARG002 + ) -> torch.Tensor: + if task.task_type == "Classification": + input_type = "classification" + elif task.task_type == "Clustering": + input_type = "clustering" + else: + input_type = "search_document" + return self._embed(sentences, input_type=input_type) + + def encode_queries(self, queries: list[str], batch_size: int, **kwargs): + return self._embed(queries, input_type="search_query") + + def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs): + if type(corpus) is dict: + sentences = [ + (corpus["title"][i] + self.sep + corpus["text"][i]).strip() + if "title" in corpus + else corpus["text"][i].strip() + for i in range(len(corpus["text"])) + ] + else: + sentences = [ + (doc["title"] + self.sep + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() + for doc in corpus + ] + return self._embed(sentences, input_type="search_document") + @models.register("embed-multilingual-v3.0") def create_embed_multilingual_v3() -> seb.EmbeddingModel: From ecee037d73448de50320d58063d08917fcd9d77b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 11:20:13 +0100 Subject: [PATCH 3/8] Added encode_queries and encode_documents to EmbeddingModel, made task optional --- src/seb/interfaces/model.py | 34 +++++++++++++++++-- src/seb/registered_models/cohere_models.py | 4 +-- src/seb/registered_models/e5_mistral.py | 5 ++- src/seb/registered_models/e5_models.py | 2 ++ src/seb/registered_models/fairseq_models.py | 19 ++++++++--- src/seb/registered_models/hf_models.py | 2 +- .../registered_models/translate_e5_models.py | 4 +-- 7 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/seb/interfaces/model.py b/src/seb/interfaces/model.py index 5ad07a28..18f624cb 100644 --- a/src/seb/interfaces/model.py +++ b/src/seb/interfaces/model.py @@ -1,6 +1,7 @@ import json from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, runtime_checkable +from typing import (TYPE_CHECKING, Any, Callable, Optional, Protocol, + runtime_checkable) from numpy.typing import ArrayLike from pydantic import BaseModel @@ -21,7 +22,7 @@ def encode( self, sentences: list[str], *, - task: "Task", + task: Optional["Task"] = None, batch_size: int = 32, **kwargs: Any, ) -> ArrayLike: @@ -110,7 +111,7 @@ def encode( self, sentences: list[str], *, - task: "Task", + task: Optional["Task"] = None, batch_size: int = 32, **kwargs: Any, ) -> ArrayLike: @@ -127,3 +128,30 @@ def encode( Embeddings for the given documents """ return self.model.encode(sentences, batch_size=batch_size, task=task, **kwargs) + + def encode_queries(self, queries: list[str], batch_size: int, **kwargs): + try: + return self.model.encode_queries(queries, batch_size=batch_size, **kwargs) + except AttributeError: + return self.encode(queries, task=None, batch_size=batch_size, **kwargs) + + def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs): + try: + return self.model.encode_corpus(corpus, batch_size=batch_size, **kwargs) + except AttributeError: + sep = " " + if type(corpus) is dict: + sentences = [ + (corpus["title"][i] + sep + corpus["text"][i]).strip() + if "title" in corpus + else corpus["text"][i].strip() + for i in range(len(corpus["text"])) + ] + else: + sentences = [ + (doc["title"] + sep + doc["text"]).strip() + if "title" in doc + else doc["text"].strip() + for doc in corpus + ] + return self.encode(sentences, task=None, batch_size=batch_size, **kwargs) diff --git a/src/seb/registered_models/cohere_models.py b/src/seb/registered_models/cohere_models.py index b8384fa2..d11830db 100644 --- a/src/seb/registered_models/cohere_models.py +++ b/src/seb/registered_models/cohere_models.py @@ -5,7 +5,7 @@ import logging from functools import partial -from typing import Any +from typing import Any, Optional import torch @@ -41,7 +41,7 @@ def encode( sentences: list[str], batch_size: int = 32, # noqa: ARG002 *, - task: Task, + task: Optional[Task] = None, **kwargs: Any, # noqa: ARG002 ) -> torch.Tensor: if task.task_type == "Classification": diff --git a/src/seb/registered_models/e5_mistral.py b/src/seb/registered_models/e5_mistral.py index 4dafe5b7..6a2e5916 100644 --- a/src/seb/registered_models/e5_mistral.py +++ b/src/seb/registered_models/e5_mistral.py @@ -1,6 +1,6 @@ from collections.abc import Iterable, Sequence from itertools import islice -from typing import Any, TypeVar +from typing import Any, Optional, TypeVar import torch import torch.nn.functional as F @@ -10,6 +10,7 @@ from seb import models from seb.interfaces.model import EmbeddingModel, Encoder, ModelMeta +from seb.interfaces.task import Task T = TypeVar("T") @@ -77,6 +78,8 @@ def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tenso def encode( self, sentences: list[str], + *, + task: Optional[Task] = None, batch_size: int = 32, **kwargs: Any, # noqa ) -> ArrayLike: diff --git a/src/seb/registered_models/e5_models.py b/src/seb/registered_models/e5_models.py index 278a1e36..cf9205e5 100644 --- a/src/seb/registered_models/e5_models.py +++ b/src/seb/registered_models/e5_models.py @@ -27,10 +27,12 @@ def encode( return self.encode_queries(sentences, batch_size=batch_size, **kwargs) def encode_queries(self, queries: list[str], batch_size: int, **kwargs): + print("ENCODING QUERYYYYY!!!!") sentences = ["query: " + sentence for sentence in queries] return self.mdl.encode(sentences, batch_size=batch_size, **kwargs) def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs): + print("ENCODING CORPUS!!!!") if type(corpus) is dict: sentences = [ (corpus["title"][i] + self.sep + corpus["text"][i]).strip() diff --git a/src/seb/registered_models/fairseq_models.py b/src/seb/registered_models/fairseq_models.py index 8d76f82f..55b7137e 100644 --- a/src/seb/registered_models/fairseq_models.py +++ b/src/seb/registered_models/fairseq_models.py @@ -6,6 +6,7 @@ import torch from seb.interfaces.model import EmbeddingModel, Encoder, ModelMeta +from seb.interfaces.task import Task from seb.registries import models @@ -35,9 +36,7 @@ def __init__( Norwegian Nynorsk, and Norwegian Bokmål, respectively. """ from sonar.models.sonar_text import ( # type: ignore - load_sonar_text_encoder_model, - load_sonar_tokenizer, - ) + load_sonar_text_encoder_model, load_sonar_tokenizer) super().__init__() @@ -60,6 +59,8 @@ def __init__( def encode( self, input: Union[Path, Sequence[str]], # noqa: A002 + *, + task: Optional[Task] = None, batch_size: int, **kwargs: dict, # noqa: ARG002 ) -> torch.Tensor: @@ -72,7 +73,11 @@ def encode( tokenizer_encoder = self.tokenizer.create_encoder(lang=self.source_lang) # type: ignore pipeline = ( - (read_text(input) if isinstance(input, (str, Path)) else read_sequence(input)) + ( + read_text(input) + if isinstance(input, (str, Path)) + else read_sequence(input) + ) .map(tokenizer_encoder) .bucket(batch_size) .map(Collater(self.tokenizer.vocab_info.pad_idx)) # type: ignore @@ -96,7 +101,11 @@ def get_sonar_model(source_lang: str) -> SonarTextToEmbeddingModelPipeline: source_lang=source_lang, ) except ImportError: - msg = "Could not fetch Sonar Models. Make sure you have" + "fairseq2 installed. This is currently only supported for " + "Linux." + msg = ( + "Could not fetch Sonar Models. Make sure you have" + + "fairseq2 installed. This is currently only supported for " + + "Linux." + ) raise ImportError(msg) # noqa B904 diff --git a/src/seb/registered_models/hf_models.py b/src/seb/registered_models/hf_models.py index 2e91d7c0..57a2a12f 100644 --- a/src/seb/registered_models/hf_models.py +++ b/src/seb/registered_models/hf_models.py @@ -29,7 +29,7 @@ def encode( sentences: list[str], *, batch_size: int, - task: Task, # noqa: ARG002 + task: Optional[Task] = None, # noqa: ARG002 **kwargs: Any, ) -> ArrayLike: return super().encode(sentences, batch_size=batch_size, **kwargs) # type: ignore diff --git a/src/seb/registered_models/translate_e5_models.py b/src/seb/registered_models/translate_e5_models.py index ea687588..165e712a 100644 --- a/src/seb/registered_models/translate_e5_models.py +++ b/src/seb/registered_models/translate_e5_models.py @@ -1,6 +1,6 @@ from collections.abc import Sequence from functools import partial -from typing import Any +from typing import Any, Optional import torch from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer @@ -34,7 +34,7 @@ def encode( self, sentences: list[str], *, - task: seb.Task, # noqa: ARG002 + task: Optional[seb.Task] = None, # noqa: ARG002 batch_size: int = 32, **kwargs: Any, ) -> torch.Tensor: From 79708bfe5824a465ae018c4d0b4921acc9b47ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 11:22:53 +0100 Subject: [PATCH 4/8] Replaced MTEBTaskModel with partial() --- src/seb/interfaces/mteb_task.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/seb/interfaces/mteb_task.py b/src/seb/interfaces/mteb_task.py index 9d306bb9..b9a4df82 100644 --- a/src/seb/interfaces/mteb_task.py +++ b/src/seb/interfaces/mteb_task.py @@ -1,4 +1,5 @@ from datetime import datetime +from functools import partial from typing import Any, Union import numpy as np @@ -12,15 +13,6 @@ from .task import DescriptiveDatasetStats, Task -class MTEBTaskModel(Encoder): - def __init__(self, mteb_model: Encoder, task: Task) -> None: - self.mteb_model = mteb_model - self.task = task - - def encode(self, texts: list[str], **kwargs: Any) -> ArrayLike: - return self.mteb_model.encode(texts, task=self.task, **kwargs) - - class MTEBTask(Task): def __init__(self, mteb_task: AbsTask) -> None: self.mteb_task = mteb_task @@ -76,8 +68,12 @@ def get_descriptive_stats(self) -> DescriptiveDatasetStats: def evaluate(self, model: Encoder) -> TaskResult: split = self.mteb_task.description["eval_splits"][0] - task_model = MTEBTaskModel(model, self) - scores = self.mteb_task.evaluate(task_model, split=split) + # Infusing task into encode() + original_encode = model.encode + model.encode = partial(model.encode, task=self) + scores = self.mteb_task.evaluate(model, split=split) + # Resetting encode to original + model.encode = original_encode if scores is None: raise ValueError("MTEBTask evaluation failed.") From 674a0054e281d5536ac2df13b39e8f3ecd017c7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 11:25:44 +0100 Subject: [PATCH 5/8] Removed reference to MTEBTask from ScaLA --- src/seb/registered_tasks/multilingual.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/seb/registered_tasks/multilingual.py b/src/seb/registered_tasks/multilingual.py index 53de7154..5f6767b5 100644 --- a/src/seb/registered_tasks/multilingual.py +++ b/src/seb/registered_tasks/multilingual.py @@ -1,11 +1,12 @@ from datetime import datetime +from functools import partial from typing import Any import numpy as np from datasets import DatasetDict, concatenate_datasets from seb.interfaces.model import Encoder -from seb.interfaces.mteb_task import MTEBTask, MTEBTaskModel +from seb.interfaces.mteb_task import MTEBTask from seb.interfaces.task import Task from seb.registries import tasks from seb.result_dataclasses import TaskResult @@ -37,13 +38,9 @@ def create_massive_scenario() -> Task: @tasks.register("ScaLA") def create_scala() -> Task: - from mteb import ( - ScalaDaClassification, - ScalaNbClassification, - ScalaNnClassification, - ScalaSvClassification, - __version__, - ) + from mteb import (ScalaDaClassification, ScalaNbClassification, + ScalaNnClassification, ScalaSvClassification, + __version__) class ScalaTask(Task): def __init__(self) -> None: @@ -96,11 +93,13 @@ def get_descriptive_stats(self) -> dict[str, Any]: def evaluate(self, model: Encoder) -> TaskResult: scores = {} - _model = MTEBTaskModel(model, self) + original_encode = model.encode + model.encode = partial(model.encode, task=self) for lang, mteb_task in self.mteb_tasks.items(): mteb_task.load_data() - score = mteb_task.evaluate(_model) + score = mteb_task.evaluate(model) scores[lang] = score + model.encode = original_encode return TaskResult( task_name=self.name, From 75debecb9e4108b30080845ff81bcd4935e81286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 12:24:20 +0100 Subject: [PATCH 6/8] Made EmbeddingModel into a dataclass instead of BaseModel --- src/seb/interfaces/model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/seb/interfaces/model.py b/src/seb/interfaces/model.py index 18f624cb..40005149 100644 --- a/src/seb/interfaces/model.py +++ b/src/seb/interfaces/model.py @@ -1,4 +1,5 @@ import json +from dataclasses import dataclass from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, Optional, Protocol, runtime_checkable) @@ -79,7 +80,8 @@ def from_disk(cls, path: Path) -> "ModelMeta": return cls(**model_meta) -class EmbeddingModel(BaseModel): +@dataclass +class EmbeddingModel: """ An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit) and includes metadata pertaining to the specific model. From cc12070004fb4ea943a5e7f76a086cba234ed8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 12:25:01 +0100 Subject: [PATCH 7/8] Removed debugging print statements from E5 --- src/seb/registered_models/e5_models.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/seb/registered_models/e5_models.py b/src/seb/registered_models/e5_models.py index cf9205e5..278a1e36 100644 --- a/src/seb/registered_models/e5_models.py +++ b/src/seb/registered_models/e5_models.py @@ -27,12 +27,10 @@ def encode( return self.encode_queries(sentences, batch_size=batch_size, **kwargs) def encode_queries(self, queries: list[str], batch_size: int, **kwargs): - print("ENCODING QUERYYYYY!!!!") sentences = ["query: " + sentence for sentence in queries] return self.mdl.encode(sentences, batch_size=batch_size, **kwargs) def encode_corpus(self, corpus: list[dict[str, str]], batch_size: int, **kwargs): - print("ENCODING CORPUS!!!!") if type(corpus) is dict: sentences = [ (corpus["title"][i] + self.sep + corpus["text"][i]).strip() From 34a2612eb578c909d14266092a539537e208eea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 26 Jan 2024 12:51:50 +0100 Subject: [PATCH 8/8] Put resetting the model's encode() method to a finally clause --- src/seb/interfaces/mteb_task.py | 12 ++++++++---- src/seb/registered_tasks/multilingual.py | 19 +++++++++++++------ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/seb/interfaces/mteb_task.py b/src/seb/interfaces/mteb_task.py index b9a4df82..0afca106 100644 --- a/src/seb/interfaces/mteb_task.py +++ b/src/seb/interfaces/mteb_task.py @@ -70,10 +70,14 @@ def evaluate(self, model: Encoder) -> TaskResult: split = self.mteb_task.description["eval_splits"][0] # Infusing task into encode() original_encode = model.encode - model.encode = partial(model.encode, task=self) - scores = self.mteb_task.evaluate(model, split=split) - # Resetting encode to original - model.encode = original_encode + try: + model.encode = partial(model.encode, task=self) + scores = self.mteb_task.evaluate(model, split=split) + except Exception as e: + raise e + finally: + # Resetting encode to original + model.encode = original_encode if scores is None: raise ValueError("MTEBTask evaluation failed.") diff --git a/src/seb/registered_tasks/multilingual.py b/src/seb/registered_tasks/multilingual.py index 5f6767b5..f63bbaef 100644 --- a/src/seb/registered_tasks/multilingual.py +++ b/src/seb/registered_tasks/multilingual.py @@ -93,13 +93,20 @@ def get_descriptive_stats(self) -> dict[str, Any]: def evaluate(self, model: Encoder) -> TaskResult: scores = {} + # Infusing task into encode() original_encode = model.encode - model.encode = partial(model.encode, task=self) - for lang, mteb_task in self.mteb_tasks.items(): - mteb_task.load_data() - score = mteb_task.evaluate(model) - scores[lang] = score - model.encode = original_encode + try: + model.encode = partial(model.encode, task=self) + for lang, mteb_task in self.mteb_tasks.items(): + mteb_task.load_data() + score = mteb_task.evaluate(model) + scores[lang] = score + model.encode = original_encode + except Exception as e: + raise e + finally: + # Resetting encode to original + model.encode = original_encode return TaskResult( task_name=self.name,