fix: restructure repo

KennethEnevoldsen · Jan 18, 2024 · 64bace6 · 64bace6
1 parent af6f926
commit 64bace6
Show file tree

Hide file tree

Showing 28 changed files with 407 additions and 157 deletions.
diff --git a/src/seb/__init__.py b/src/seb/__init__.py
@@ -1,6 +1,5 @@
 from .benchmark import Benchmark
 from .full_benchmark import run_benchmark
-from .model_interface import ModelInterface, ModelMeta, EmbeddingModel
 from .registries import (
     get_all_models,
     get_all_tasks,
@@ -9,7 +8,9 @@
     models,
     tasks,
 )
+
+from .interfaces.task import Task
+from .interfaces.model import EmbeddingModel, ModelMeta, Encoder
 from .result_dataclasses import BenchmarkResults, TaskError, TaskResult
-from .seb_models import *  # import all SEB models
-from .seb_tasks import *  # import all SEB tasks
-from .tasks_interface import Task
+from .registered_models import *  # import all SEB models
+from .registered_tasks import *  # import all SEB tasks
diff --git a/src/seb/benchmark.py b/src/seb/benchmark.py
@@ -7,10 +7,10 @@
 
 from tqdm import tqdm
 
-from .model_interface import EmbeddingModel
+from .interfaces.model import EmbeddingModel
+from .interfaces.task import Task
 from .registries import get_all_tasks, get_task
 from .result_dataclasses import BenchmarkResults, TaskError, TaskResult
-from .tasks_interface import Task
 from .warning_ignore_manager import WarningIgnoreContextManager
 
 logger = logging.getLogger(__name__)

diff --git a/src/seb/full_benchmark.py b/src/seb/full_benchmark.py
@@ -6,9 +6,8 @@
 from pathlib import Path
 from typing import Optional
 
-from seb.model_interface import EmbeddingModel
-
 from .benchmark import Benchmark
+from .interfaces.model import EmbeddingModel
 from .registries import get_all_models
 from .result_dataclasses import BenchmarkResults
 

diff --git a/src/seb/model_interface.py → src/seb/interfaces/model.py b/src/seb/model_interface.py → src/seb/interfaces/model.py
@@ -1,27 +1,31 @@
-from typing import Any, Callable, Optional, Protocol, Union, runtime_checkable
+from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, runtime_checkable
 
-from numpy import ndarray
 from pydantic import BaseModel
-from torch import Tensor
 
-ArrayLike = Union[ndarray, Tensor]
+from ..types import ArrayLike
+
+if TYPE_CHECKING:
+    from .task import Task
 
 
 @runtime_checkable
-class ModelInterface(Protocol):
+class Encoder(Protocol):
     """
     Interface which all models must implement.
     """
 
     def encode(
         self,
         sentences: list[str],
+        task: "Task",
         batch_size: int = 32,
         **kwargs: dict,
     ) -> ArrayLike:
         """Returns a list of embeddings for the given sentences.
         Args:
             sentences: List of sentences to encode
+            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
+                to be used.
             batch_size: Batch size for the encoding
             kwargs: arguments to pass to the models encode method
 
@@ -62,11 +66,11 @@ class EmbeddingModel(BaseModel):
     """
 
     meta: ModelMeta
-    loader: Callable[[], ModelInterface]
-    _model: Optional[ModelInterface] = None
+    loader: Callable[[], Encoder]
+    _model: Optional[Encoder] = None
 
     @property
-    def model(self) -> ModelInterface:
+    def model(self) -> Encoder:
         """
         Dynimically load the model.
         """

diff --git a/src/seb/tasks_interface.py → src/seb/interfaces/mteb_task.py b/src/seb/tasks_interface.py → src/seb/interfaces/mteb_task.py
@@ -1,60 +1,24 @@
 from datetime import datetime
-from typing import Any, Protocol, runtime_checkable
+from typing import Any
 
 import numpy as np
 from datasets import DatasetDict, concatenate_datasets
 from mteb import AbsTask
 from mteb import __version__ as mteb_version
 
-from .model_interface import ModelInterface
-from .result_dataclasses import TaskResult
+from ..result_dataclasses import TaskResult
+from ..types import ArrayLike
+from .model import Encoder
+from .task import DescriptiveDatasetStats, Task
 
 
-@runtime_checkable
-class Task(Protocol):
-    """
-    A task is a specific evaluation task for a sentence embedding model.
+class MTEBTaskModel(Encoder):
+    def __init__(self, mteb_model: Encoder, task: Task) -> None:
+        self.mteb_model = mteb_model
+        self.task = task
 
-    Attributes:
-        name: The name of the task.
-        main_score: The main score of the task.
-        description: A description of the task.
-        reference: A reference to the task.
-        version: The version of the task.
-        languages: The languages of the task.
-        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
-
-    """
-
-    name: str
-    main_score: str
-    description: str
-    reference: str
-    version: str
-    languages: list[str]
-    domain: list[str]
-
-    def evaluate(self, model: ModelInterface) -> TaskResult:
-        """
-        Evaluates a Sentence Embedding Model on the task.
-
-        Args:
-            model: A sentence embedding model.
-
-        Returns:
-            A TaskResult object.
-        """
-        ...
-
-    def get_descriptive_stats(self) -> dict[str, Any]:
-        ...
-
-    def name_to_path(self) -> str:
-        """
-        Convert a name to a path.
-        """
-        name = self.name.replace("/", "__").replace(" ", "_")
-        return name
+    def encode(self, texts: list[str], **kwargs: Any) -> ArrayLike:
+        return self.mteb_model.encode(texts, task=self.task, **kwargs)
 
 
 class MTEBTask(Task):
@@ -67,7 +31,7 @@ def __init__(self, mteb_task: AbsTask) -> None:
         self.version = f"{mteb_version}"
         self.reference = mteb_desc["reference"]
         self.languages = mteb_desc["eval_langs"]
-        self.type = mteb_desc["type"]
+        self.task_type = mteb_desc["type"]
         self.domain = []
         self._text_columns = ["text"]
 
@@ -93,26 +57,27 @@ def load_data(self) -> DatasetDict:
 
         return DatasetDict(ds)
 
-    def get_descriptive_stats(self) -> dict[str, Any]:
-        ds = self.load_data()
+    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
+        ds: DatasetDict = self.load_data()
         texts = []
         for split in ds:
             for text_column in self._text_columns:
                 texts += ds[split][text_column]
 
-        document_lengths = [len(text) for text in texts]
+        document_lengths = np.array([len(text) for text in texts])
 
-        mean = np.mean(document_lengths)
-        std = np.std(document_lengths)
-        return {
-            "mean_document_length": mean,
-            "std_document_length": std,
-            "num_documents": len(document_lengths),
-        }
+        mean = float(np.mean(document_lengths))
+        std = float(np.std(document_lengths))
+        return DescriptiveDatasetStats(
+            mean_document_length=mean,
+            std_document_length=std,
+            num_documents=len(document_lengths),
+        )
 
-    def evaluate(self, model: ModelInterface) -> TaskResult:
+    def evaluate(self, model: Encoder) -> TaskResult:
         split = self.mteb_task.description["eval_splits"][0]
-        scores = self.mteb_task.evaluate(model, split=split)
+        task_model = MTEBTaskModel(model, self)
+        scores = self.mteb_task.evaluate(task_model, split=split)
         if scores is None:
             raise ValueError("MTEBTask evaluation failed.")
 

diff --git a/src/seb/interfaces/task.py b/src/seb/interfaces/task.py
@@ -0,0 +1,151 @@
+from typing import Any, Callable, Optional, Protocol, runtime_checkable
+
+from pydantic import BaseModel
+
+from ..result_dataclasses import TaskResult
+from ..types import ArrayLike, DescriptiveDatasetStats, Domain, Language, TaskType
+
+
+@runtime_checkable
+class Task(Protocol):
+    """
+    Structural interface for a single evaluation task that a sentence embedding model can be scored on.
+
+    Attributes:
+        name: The name of the task.
+        main_score: The main score of the task.
+        description: A description of the task.
+        reference: A reference to the task.
+        version: The version of the task.
+        languages: The languages of the task.
+        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
+        task_type: The type of the task.
+    """
+
+    name: str
+    main_score: str
+    description: str
+    reference: str
+    version: str
+    languages: list[Language]
+    domain: list[Domain]
+    task_type: TaskType
+
+    def evaluate(self, model: "Encoder") -> TaskResult:
+        """
+        Score a sentence embedding model on this task.
+
+        Args:
+            model: The sentence embedding model to evaluate.
+
+        Returns:
+            The TaskResult produced by the evaluation.
+        """
+        ...
+
+    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
+        """
+        Return the descriptive statistics of the task as a DescriptiveDatasetStats.
+        """
+        ...
+
+    def name_to_path(self) -> str:
+        """
+        Build a path-friendly form of the task name: "/" becomes "__" and " " becomes "_".
+        """
+        slash_free = self.name.replace("/", "__")
+        return slash_free.replace(" ", "_")
+
+
+@runtime_checkable
+class Encoder(Protocol):
+    """
+    Interface which all models must implement.
+
+    The protocol consists of a single `encode` method; being `runtime_checkable`, it can be used with `isinstance`.
+    """
+
+    # NOTE(review): `**kwargs: dict` annotates every keyword *value* as a dict; `Any` is presumably what is
+    # intended - confirm before tightening type checks against this protocol.
+    def encode(
+        self,
+        sentences: list[str],
+        task: Task,
+        batch_size: int = 32,
+        **kwargs: dict,
+    ) -> ArrayLike:
+        """Returns the embeddings for the given sentences.
+        Args:
+            sentences: List of sentences to encode
+            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
+                to be used.
+            batch_size: Batch size for the encoding
+            kwargs: arguments to pass to the models encode method
+
+        Returns:
+            Embeddings for the given documents
+        """
+        ...
+
+
+class ModelMeta(BaseModel):
+    """
+    Metadata describing an embedding model.
+
+    Only `name` is required; every other field has a default.
+    """
+
+    name: str
+    description: Optional[str] = None
+    huggingface_name: Optional[str] = None
+    reference: Optional[str] = None
+    languages: list[str] = []
+    open_source: bool = False
+    embedding_size: Optional[int] = None
+
+    def get_path_name(self) -> str:
+        """
+        Return a path-friendly identifier, built from `huggingface_name` when it is set and from `name` otherwise.
+        """
+        source_name = self.name if self.huggingface_name is None else self.huggingface_name
+        return self._name_to_path(source_name)
+
+    @staticmethod
+    def _name_to_path(name: str) -> str:
+        """
+        Replace "/" with "__" and " " with "_" in the given name.
+        """
+        slash_free = name.replace("/", "__")
+        return slash_free.replace(" ", "_")
+
+    def get_huggingface_url(self) -> str:
+        """
+        Return the huggingface.co URL for this model.
+
+        Raises:
+            ValueError: If `huggingface_name` is None.
+        """
+        if self.huggingface_name is not None:
+            return f"https://huggingface.co/{self.huggingface_name}"
+        raise ValueError("This model does not have an associated huggingface name.")
+
+
+class EmbeddingModel(BaseModel):
+    """
+    An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
+    and includes metadata pertaining to the specific model.
+
+    Attributes:
+        meta: Metadata describing the model.
+        loader: Zero-argument callable which constructs the underlying encoder.
+    """
+
+    meta: ModelMeta
+    loader: Callable[[], Encoder]
+    # Cache for the loaded encoder; populated on the first access to `model`.
+    _model: Optional[Encoder] = None
+
+    @property
+    def model(self) -> Encoder:
+        """
+        Dynamically load the model.
+
+        The loader is only invoked on the first access; later accesses return the cached encoder.
+        """
+        if self._model is None:
+            self._model = self.loader()
+        return self._model
+
+    @property
+    def number_of_parameters(self) -> Optional[int]:
+        """
+        Returns the number of trainable parameters in the model.
+
+        Accessing this property loads the model if it has not been loaded yet.
+
+        Returns:
+            The summed element count of all parameters with `requires_grad` set, or None if the underlying model does not expose a
+            callable `parameters` attribute.
+        """
+        # Guard on the attribute that is actually called. Checking for `num_parameters` (as before) raised an
+        # AttributeError for models lacking `parameters()` and reported None for models that do provide it.
+        parameters = getattr(self.model, "parameters", None)
+        if not callable(parameters):
+            return None
+        return sum(p.numel() for p in parameters() if p.requires_grad)  # type: ignore
+
+    def encode(
+        self,
+        sentences: list[str],
+        batch_size: int = 32,
+        **kwargs: Any,
+    ) -> ArrayLike:
+        """
+        Returns a list of embeddings for the given sentences.
+        Args:
+            sentences: List of sentences to encode
+            batch_size: Batch size for the encoding
+            kwargs: arguments to pass to the models encode method
+
+        Returns:
+            Embeddings for the given documents
+        """
+        # NOTE(review): `Encoder.encode` declares a required `task` argument which is not a named parameter here,
+        # so it only reaches the model when the caller supplies it through `kwargs`.
+        return self.model.encode(sentences, batch_size=batch_size, **kwargs)
diff --git a/src/seb/seb_models/__init__.py → src/seb/registered_models/__init__.py b/src/seb/seb_models/__init__.py → src/seb/registered_models/__init__.py