Added speed task #70

Merged: 3 commits, merged Jan 22, 2024. Showing changes from 2 commits.
5 changes: 3 additions & 2 deletions src/seb/cli/table.py
@@ -5,10 +5,11 @@
 from rich.table import Table
 
 import seb
+from seb.types import Language
 
 
-def get_main_score(task: seb.TaskResult, langs: Optional[list[str]]) -> float:
-    if langs is None:
+def get_main_score(task: seb.TaskResult, langs: Optional[list[Language]]) -> float:
+    if langs is None:  # noqa
         _langs = task.languages
     else:
         _langs = set(langs) & set(task.languages)
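For reference, a minimal sketch of what the language filter in get_main_score does; the language codes and the candidate list below are illustrative values, not taken from the PR:

    # A task evaluated on Danish, Norwegian Bokmål and Swedish, filtered
    # against a user-supplied language list that also contains English.
    task_languages = ["da", "nb", "sv"]
    langs = ["da", "en"]

    _langs = set(langs) & set(task_languages)
    # _langs == {"da"}: English is dropped because the task has no English split.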
37 changes: 37 additions & 0 deletions src/seb/full_benchmark.py
@@ -3,14 +3,18 @@
 """
 
 
+import logging
 from pathlib import Path
 from typing import Optional
 
 from .benchmark import Benchmark
 from .interfaces.model import EmbeddingModel
+from .registered_tasks.speed import CPUSpeedTask, GPUSpeedTask
 from .registries import get_all_models
 from .result_dataclasses import BenchmarkResults
 
+logger = logging.getLogger(__name__)
+
 BENCHMARKS = {
     "Mainland Scandinavian": ["da", "sv", "nn", "nb"],
     "Danish": ["da"],
@@ -44,3 +48,36 @@ def run_benchmark(
         results[subset] = bm_results
 
     return results
+
+
+def run_speed_benchmark(
+    use_cache: bool = True,
+    run_models: bool = True,
+    raise_errors: bool = True,
+    cache_dir: Optional[Path] = None,
+) -> dict[str, list[BenchmarkResults]]:
+    """
+    Run the speed benchmark.
+    """
+    models: list[EmbeddingModel] = get_all_models()
+    tasks = [CPUSpeedTask(), GPUSpeedTask()]  # type: ignore
+
+    if use_cache:
+        logger.warn(
+            "Running the speed benchmark with use_cache=True will load speed results from the cache, this might lead to incomparable results."
+        )
+
+    results = {}
+    for subset, langs in BENCHMARKS.items():
+        benchmark = Benchmark(languages=langs)
+        bm_results = benchmark.evaluate_models(
+            models=models,
+            use_cache=use_cache,
+            run_model=run_models,
+            raise_errors=raise_errors,
+            cache_dir=cache_dir,
+        )
+
+        results[subset] = bm_results
+
+    return results
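A minimal usage sketch of the new entry point, assuming the package is installed from this repo so that seb.full_benchmark exposes run_speed_benchmark as added above; the print loop is illustrative only:

    from seb.full_benchmark import run_speed_benchmark

    # use_cache=False sidesteps the warning above about incomparable cached timings.
    speed_results = run_speed_benchmark(use_cache=False, raise_errors=False)
    for subset, bm_results in speed_results.items():
        print(subset, bm_results)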
1 change: 1 addition & 0 deletions src/seb/interfaces/model.py
@@ -25,6 +25,7 @@ def encode(
         **kwargs: Any,
     ) -> ArrayLike:
         """Returns a list of embeddings for the given sentences.
+
         Args:
             sentences: List of sentences to encode
             task: The task to encode for. This allows the model to encode differently for different tasks. Will always be given but does not need
6 changes: 3 additions & 3 deletions src/seb/interfaces/mteb_task.py
@@ -1,13 +1,13 @@
 from datetime import datetime
-from typing import Any
+from typing import Any, Union
 
 import numpy as np
 from datasets import DatasetDict, concatenate_datasets
 from mteb import AbsTask
 from mteb import __version__ as mteb_version
 
 from ..result_dataclasses import TaskResult
-from ..types import ArrayLike
+from ..types import ArrayLike, Language
 from .model import Encoder
 from .task import DescriptiveDatasetStats, Task
 
@@ -88,7 +88,7 @@ def evaluate(self, model: Encoder) -> TaskResult:
         scores = scores.get(split, scores)
         score_is_nested = isinstance(scores[next(iter(scores.keys()))], dict)
         if not score_is_nested:
-            _scores = {lang: scores for lang in self.languages}
+            _scores: dict[str, dict[str, Union[float, str]]] = {lang: scores for lang in self.languages}
             scores = _scores
 
         task_result = TaskResult(

Collaborator, commenting on the _scores annotation: Is this to appease the type checker?
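To show what the nesting branch above does in isolation, here is a small sketch with made-up scores and languages (not taken from the PR):

    # A flat MTEB score dict is duplicated per language so downstream code
    # can always index scores[language][metric].
    languages = ["da", "nb", "sv"]
    scores = {"accuracy": 0.83, "f1": 0.81}  # flat, i.e. not nested per language

    score_is_nested = isinstance(scores[next(iter(scores.keys()))], dict)
    if not score_is_nested:
        scores = {lang: scores for lang in languages}

    # scores == {"da": {"accuracy": 0.83, "f1": 0.81}, "nb": {...}, "sv": {...}}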
2 changes: 2 additions & 0 deletions src/seb/interfaces/task.py
@@ -1,5 +1,7 @@
 from typing import Protocol, runtime_checkable
 
+from attr import dataclass
+
 from ..result_dataclasses import TaskResult
 from ..types import DescriptiveDatasetStats, Domain, Language, TaskType
 from .model import Encoder
103 changes: 103 additions & 0 deletions src/seb/registered_tasks/speed.py
@@ -0,0 +1,103 @@
import logging
import platform
import time
from datetime import datetime
from pathlib import Path
from typing import Optional, Union

import numpy as np
import psutil
import torch

from seb.interfaces.model import EmbeddingModel
from seb.interfaces.task import Task
from seb.result_dataclasses import TaskResult
from seb.types import DescriptiveDatasetStats, languages_in_seb

logger = logging.getLogger(__name__)


class CPUSpeedTask(Task):
    reference = "NA"
    version = "0.0.1"
    task_type = "Speed"
    languages = languages_in_seb
    main_score = "Inference speed (seconds)"
    domain = ["fiction"]  # noqa
    name = "Speed (CPU)"
    description = "Time taken to encode the text 'The Ugly Duckling' split by paragraphs on a CPU."
    device = "cpu"
    _dataset: Optional[list[str]] = None

    def load_dataset(self) -> list[str]:
        file_path = Path(__file__).parent / "the_ugly_duckling.txt"
        with file_path.open("r") as f:
            text = f.read()
        return text.split("\n\n")

    @property
    def dataset(self) -> list[str]:
        if self._dataset is None:
            self._dataset = self.load_dataset()
        return self._dataset

    def get_descriptive_stats(self) -> DescriptiveDatasetStats:
        dataset = self.load_dataset()
        lengths = np.array([len(x) for x in dataset])
        return DescriptiveDatasetStats(
            mean_document_length=float(np.mean(lengths)), std_document_length=float(np.std(lengths)), num_documents=len(dataset)
        )

    def get_time_taken(self, model: EmbeddingModel) -> float:
        dataset = self.load_dataset()
        start = time.time()
        with torch.no_grad():
            model.encode(dataset, batch_size=1, device=self.device, task=self)
        time_taken = time.time() - start
        return time_taken

    def evaluate(self, model: EmbeddingModel) -> TaskResult:
        model.loader()  # ensure model is loaded

        has_to_method = callable(getattr(model._model, "to", None))
        if has_to_method:
            model = model.to(self.device)  # type: ignore

        run_inference = not (self.device == "cuda" and not has_to_method)
        if run_inference:
            time_taken = self.get_time_taken(model)
        else:
            logger.warn(f"Could not run inference on {model.meta.name} on {self.device} as it does not have a 'to' method. Skipping")
            time_taken = np.nan

        scores: dict[str, Union[str, float]] = {self.main_score: time_taken, **self.get_system_info()}

        return TaskResult(
            task_name=self.name,
            task_description=self.description,
            task_version=self.version,
            scores={Language: scores for Language in self.languages},
            time_of_run=datetime.now(),
            main_score=self.main_score,
        )

    def get_system_info(self) -> dict[str, str]:
        """
        Returns a dictionary with system information.
        """
        info = {}
        info["platform"] = platform.system()
        info["platform-release"] = platform.release()
        info["platform-version"] = platform.version()
        info["architecture"] = platform.machine()
        info["processor"] = platform.processor()
        info["ram"] = str(round(psutil.virtual_memory().total / (1024.0**3))) + " GB"
        info["Physical cores"] = psutil.cpu_count(logical=False)
        info["Total cores"] = psutil.cpu_count(logical=True)
        return info


class GPUSpeedTask(CPUSpeedTask):
    name = "Speed (GPU)"
    description = "Time taken to encode the text 'The Ugly Duckling' split by paragraphs on a GPU."
    device: str = "cuda"
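To make the flow of the new task concrete, a hedged usage sketch; taking the first registered model and printing the raw scores dict are illustrative choices, not part of the PR:

    # Evaluate one registered model on the CPU speed task.
    from seb.registries import get_all_models
    from seb.registered_tasks.speed import CPUSpeedTask

    model = get_all_models()[0]  # any registered EmbeddingModel
    task = CPUSpeedTask()
    result = task.evaluate(model)  # loads the model, encodes the paragraphs, records seconds
    print(result.task_name, result.scores)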