Skip to content

Commit

Permalink
fix: restructure repo
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEnevoldsen committed Jan 18, 2024
1 parent af6f926 commit 64bace6
Show file tree
Hide file tree
Showing 28 changed files with 407 additions and 157 deletions.
9 changes: 5 additions & 4 deletions src/seb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .benchmark import Benchmark
from .full_benchmark import run_benchmark
from .model_interface import ModelInterface, ModelMeta, EmbeddingModel
from .registries import (
get_all_models,
get_all_tasks,
Expand All @@ -9,7 +8,9 @@
models,
tasks,
)

from .interfaces.task import Task
from .interfaces.model import EmbeddingModel, ModelMeta, Encoder
from .result_dataclasses import BenchmarkResults, TaskError, TaskResult
from .seb_models import * # import all SEB models
from .seb_tasks import * # import all SEB tasks
from .tasks_interface import Task
from .registered_models import * # import all SEB models
from .registered_tasks import * # import all SEB tasks
4 changes: 2 additions & 2 deletions src/seb/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

from tqdm import tqdm

from .model_interface import EmbeddingModel
from .interfaces.model import EmbeddingModel
from .interfaces.task import Task
from .registries import get_all_tasks, get_task
from .result_dataclasses import BenchmarkResults, TaskError, TaskResult
from .tasks_interface import Task
from .warning_ignore_manager import WarningIgnoreContextManager

logger = logging.getLogger(__name__)
Expand Down
3 changes: 1 addition & 2 deletions src/seb/full_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from pathlib import Path
from typing import Optional

from seb.model_interface import EmbeddingModel

from .benchmark import Benchmark
from .interfaces.model import EmbeddingModel
from .registries import get_all_models
from .result_dataclasses import BenchmarkResults

Expand Down
20 changes: 12 additions & 8 deletions src/seb/model_interface.py → src/seb/interfaces/model.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,31 @@
from typing import Any, Callable, Optional, Protocol, Union, runtime_checkable
from typing import TYPE_CHECKING, Any, Callable, Optional, Protocol, runtime_checkable

from numpy import ndarray
from pydantic import BaseModel
from torch import Tensor

ArrayLike = Union[ndarray, Tensor]
from ..types import ArrayLike

if TYPE_CHECKING:
from .task import Task


@runtime_checkable
class ModelInterface(Protocol):
class Encoder(Protocol):
"""
Interface which all models must implement.
"""

def encode(
self,
sentences: list[str],
task: "Task",
batch_size: int = 32,
**kwargs: dict,
) -> ArrayLike:
"""Returns a list of embeddings for the given sentences.
Args:
sentences: List of sentences to encode
task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
to be used.
batch_size: Batch size for the encoding
kwargs: arguments to pass to the models encode method
Expand Down Expand Up @@ -62,11 +66,11 @@ class EmbeddingModel(BaseModel):
"""

meta: ModelMeta
loader: Callable[[], ModelInterface]
_model: Optional[ModelInterface] = None
loader: Callable[[], Encoder]
_model: Optional[Encoder] = None

@property
def model(self) -> ModelInterface:
def model(self) -> Encoder:
"""
Dynimically load the model.
"""
Expand Down
85 changes: 25 additions & 60 deletions src/seb/tasks_interface.py → src/seb/interfaces/mteb_task.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,24 @@
from datetime import datetime
from typing import Any, Protocol, runtime_checkable
from typing import Any

import numpy as np
from datasets import DatasetDict, concatenate_datasets
from mteb import AbsTask
from mteb import __version__ as mteb_version

from .model_interface import ModelInterface
from .result_dataclasses import TaskResult
from ..result_dataclasses import TaskResult
from ..types import ArrayLike
from .model import Encoder
from .task import DescriptiveDatasetStats, Task


@runtime_checkable
class Task(Protocol):
"""
A task is a specific evaluation task for a sentence embedding model.
class MTEBTaskModel(Encoder):
def __init__(self, mteb_model: Encoder, task: Task) -> None:
self.mteb_model = mteb_model
self.task = task

Attributes:
name: The name of the task.
main_score: The main score of the task.
description: A description of the task.
reference: A reference to the task.
version: The version of the task.
languages: The languages of the task.
domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
"""

name: str
main_score: str
description: str
reference: str
version: str
languages: list[str]
domain: list[str]

def evaluate(self, model: ModelInterface) -> TaskResult:
"""
Evaluates a Sentence Embedding Model on the task.
Args:
model: A sentence embedding model.
Returns:
A TaskResult object.
"""
...

def get_descriptive_stats(self) -> dict[str, Any]:
...

def name_to_path(self) -> str:
"""
Convert a name to a path.
"""
name = self.name.replace("/", "__").replace(" ", "_")
return name
def encode(self, texts: list[str], **kwargs: Any) -> ArrayLike:
return self.mteb_model.encode(texts, task=self.task, **kwargs)


class MTEBTask(Task):
Expand All @@ -67,7 +31,7 @@ def __init__(self, mteb_task: AbsTask) -> None:
self.version = f"{mteb_version}"
self.reference = mteb_desc["reference"]
self.languages = mteb_desc["eval_langs"]
self.type = mteb_desc["type"]
self.task_type = mteb_desc["type"]
self.domain = []
self._text_columns = ["text"]

Expand All @@ -93,26 +57,27 @@ def load_data(self) -> DatasetDict:

return DatasetDict(ds)

def get_descriptive_stats(self) -> dict[str, Any]:
ds = self.load_data()
def get_descriptive_stats(self) -> DescriptiveDatasetStats:
ds: DatasetDict = self.load_data()
texts = []
for split in ds:
for text_column in self._text_columns:
texts += ds[split][text_column]

document_lengths = [len(text) for text in texts]
document_lengths = np.array([len(text) for text in texts])

mean = np.mean(document_lengths)
std = np.std(document_lengths)
return {
"mean_document_length": mean,
"std_document_length": std,
"num_documents": len(document_lengths),
}
mean = float(np.mean(document_lengths))
std = float(np.std(document_lengths))
return DescriptiveDatasetStats(
mean_document_length=mean,
std_document_length=std,
num_documents=len(document_lengths),
)

def evaluate(self, model: ModelInterface) -> TaskResult:
def evaluate(self, model: Encoder) -> TaskResult:
split = self.mteb_task.description["eval_splits"][0]
scores = self.mteb_task.evaluate(model, split=split)
task_model = MTEBTaskModel(model, self)
scores = self.mteb_task.evaluate(task_model, split=split)
if scores is None:
raise ValueError("MTEBTask evaluation failed.")

Expand Down
151 changes: 151 additions & 0 deletions src/seb/interfaces/task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from typing import Any, Callable, Optional, Protocol, runtime_checkable

from pydantic import BaseModel

from ..result_dataclasses import TaskResult
from ..types import ArrayLike, DescriptiveDatasetStats, Domain, Language, TaskType


@runtime_checkable
class Task(Protocol):
    """
    Structural interface for a single evaluation task for a sentence embedding model.

    Attributes:
        name: The name of the task.
        main_score: The main score of the task.
        description: A description of the task.
        reference: A reference to the task.
        version: The version of the task.
        languages: The languages of the task.
        domain: The domains of the task. Should be one of the categories listed on https://universaldependencies.org
        task_type: The type of the task.
    """

    name: str
    main_score: str
    description: str
    reference: str
    version: str
    languages: list[Language]
    domain: list[Domain]
    task_type: TaskType

    def evaluate(self, model: "Encoder") -> TaskResult:
        """
        Evaluate a sentence embedding model on this task.

        Args:
            model: A sentence embedding model.

        Returns:
            A TaskResult object.
        """
        ...

    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
        """
        Return descriptive statistics for the dataset behind this task.
        """
        ...

    def name_to_path(self) -> str:
        """
        Convert the task name into a string usable as a path component.

        Every "/" is replaced with "__" and every space with "_".
        """
        without_slashes = self.name.replace("/", "__")
        path_safe_name = without_slashes.replace(" ", "_")
        return path_safe_name


@runtime_checkable
class Encoder(Protocol):
    """
    Interface which all models must implement.

    The protocol is runtime-checkable, so ``isinstance(obj, Encoder)`` only verifies that an
    ``encode`` attribute exists; it does not validate the signature.
    """

    def encode(
        self,
        sentences: list[str],
        task: Task,
        batch_size: int = 32,
        # Annotating **kwargs gives the type of each *value*; arbitrary pass-through
        # arguments are therefore `Any`, not `dict`.
        **kwargs: Any,
    ) -> ArrayLike:
        """Returns a list of embeddings for the given sentences.

        Args:
            sentences: List of sentences to encode
            task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
                to be used.
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method

        Returns:
            Embeddings for the given documents
        """
        ...


class ModelMeta(BaseModel):
    """
    Metadata describing a model.

    Only ``name`` is required; every other field has a default.
    """

    name: str
    description: Optional[str] = None
    huggingface_name: Optional[str] = None
    reference: Optional[str] = None
    languages: list[str] = []
    open_source: bool = False
    embedding_size: Optional[int] = None

    def get_path_name(self) -> str:
        """
        Return a path-safe identifier for the model.

        The huggingface name is used when one is set; otherwise the plain model name is used.
        """
        identifier = self.name if self.huggingface_name is None else self.huggingface_name
        return self._name_to_path(identifier)

    @staticmethod
    def _name_to_path(name: str) -> str:
        """
        Replace every "/" with "__" and every space with "_".
        """
        without_slashes = name.replace("/", "__")
        return without_slashes.replace(" ", "_")

    def get_huggingface_url(self) -> str:
        """
        Return the huggingface.co URL built from ``huggingface_name``.

        Raises:
            ValueError: If the model has no huggingface name.
        """
        hf_name = self.huggingface_name
        if hf_name is not None:
            return f"https://huggingface.co/{hf_name}"
        raise ValueError("This model does not have an associated huggingface name.")


class EmbeddingModel(BaseModel):
    """
    An embedding model as implemented in SEB. It notably dynamically loads models (such that models are not loaded when a cache is hit)
    and includes metadata pertaining to the specific model.
    """

    meta: ModelMeta
    loader: Callable[[], Encoder]
    # Cache for the loaded encoder; stays None until `model` is first accessed.
    _model: Optional[Encoder] = None

    @property
    def model(self) -> Encoder:
        """
        Dynamically load the model.

        The loader is called on first access only; the result is cached and reused afterwards.
        """
        if self._model is None:
            self._model = self.loader()
        return self._model

    @property
    def number_of_parameters(self) -> Optional[int]:
        """
        Returns the number of trainable parameters in the model.

        Accessing this property loads the model if it has not been loaded yet.

        Returns:
            The summed element count of all parameters with ``requires_grad`` set, or None if
            the underlying model does not expose a callable ``parameters`` attribute.
        """
        # Guard on the attribute that is actually used below. The previous check looked for
        # `num_parameters`, which is unrelated to the `parameters()` call and could both
        # miss models that expose `parameters()` and raise AttributeError on models that do not.
        parameters = getattr(self.model, "parameters", None)
        if not callable(parameters):
            return None
        return sum(p.numel() for p in parameters() if p.requires_grad)  # type: ignore

    def encode(
        self,
        sentences: list[str],
        batch_size: int = 32,
        **kwargs: Any,
    ) -> ArrayLike:
        """
        Returns a list of embeddings for the given sentences.

        Args:
            sentences: List of sentences to encode
            batch_size: Batch size for the encoding
            kwargs: arguments to pass to the models encode method. The Encoder protocol's
                ``task`` argument is not a named parameter here, so it must be supplied
                through these keyword arguments.

        Returns:
            Embeddings for the given documents
        """
        return self.model.encode(sentences, batch_size=batch_size, **kwargs)
File renamed without changes.
Loading

0 comments on commit 64bace6

Please sign in to comment.