Add more unit tests #464

Merged: 4 commits, Jul 20, 2023

9 changes: 9 additions & 0 deletions requirements/requirements-test.txt
@@ -4,6 +4,8 @@
#
# pip-compile requirements-test.in
#
accelerate==0.21.0
# via -r requirements.in
aiohttp==3.8.4
# via
# fsspec
@@ -132,6 +134,8 @@ fsspec[http]==2023.6.0
# huggingface-hub
# lightning
# pytorch-lightning
greenlet==2.0.2
# via sqlalchemy
grpcio==1.49.1
# via
# pymilvus
@@ -242,6 +246,7 @@ numexpr==2.8.4
numpy==1.24.4
# via
# -r requirements.in
# accelerate
# langchain
# lightning
# numexpr
@@ -263,6 +268,7 @@ ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -293,6 +299,7 @@ protobuf==4.23.4
# ray
psutil==5.9.5
# via
# accelerate
# distributed
# lightning
py==1.11.0
@@ -346,6 +353,7 @@ pytz==2023.3
# pandas
pyyaml==6.0.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -449,6 +457,7 @@ toolz==0.12.0
torch==2.0.0
# via
# -r requirements.in
# accelerate
# lightning
# pytorch-lightning
# torchmetrics
1 change: 1 addition & 0 deletions requirements/requirements.in
@@ -26,3 +26,4 @@ ray>=2.4.0
scikit-learn>=1.1.3
torch>=2.0.0,!=2.0.1
transformers>=4.29.1
accelerate>=0.20.1
9 changes: 9 additions & 0 deletions requirements/requirements.txt
@@ -4,6 +4,8 @@
#
# pip-compile requirements.in
#
accelerate==0.21.0
# via -r requirements.in
aiohttp==3.8.4
# via
# fsspec
@@ -127,6 +129,8 @@ fsspec[http]==2023.6.0
# huggingface-hub
# lightning
# pytorch-lightning
greenlet==2.0.2
# via sqlalchemy
grpcio==1.49.1
# via
# pymilvus
@@ -226,6 +230,7 @@ numexpr==2.8.4
numpy==1.24.4
# via
# -r requirements.in
# accelerate
# langchain
# lightning
# numexpr
@@ -247,6 +252,7 @@ ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -274,6 +280,7 @@ protobuf==4.23.4
# ray
psutil==5.9.5
# via
# accelerate
# distributed
# lightning
py==1.11.0
@@ -321,6 +328,7 @@ pytz==2023.3
# pandas
pyyaml==6.0.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -417,6 +425,7 @@ toolz==0.12.0
torch==2.0.0
# via
# -r requirements.in
# accelerate
# lightning
# pytorch-lightning
# torchmetrics
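All of the pins above follow from the single new line in requirements.in: adding accelerate>=0.20.1 and re-running pip-compile resolves accelerate==0.21.0 and records it as a new reason under the existing numpy, packaging, psutil, pyyaml, and torch pins in both compiled files; the new greenlet==2.0.2 entry appears to be an incidental refresh picked up by the same re-compile.
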
2 changes: 1 addition & 1 deletion superduperdb/core/documents.py
@@ -11,7 +11,7 @@ class Document:
that resource to a mix of jsonable content or `bytes`
"""

_DEFAULT_ID_KEY = '_id'
_DEFAULT_ID_KEY: str = '_id'

def __init__(self, content: t.Dict):
self.content = content
2 changes: 1 addition & 1 deletion superduperdb/core/model.py
@@ -1,5 +1,5 @@
import inspect
from dask.distributed import Future
import inspect
import dataclasses as dc
import typing as t

1 change: 0 additions & 1 deletion superduperdb/core/vector_index.py
@@ -130,7 +130,6 @@ def get_nearest(
models, keys = self.models_keys
if len(models) != len(keys):
raise ValueError(f'len(models={models}) != len(keys={keys})')

within_ids = ids or ()

if db.db.id_field in like.content: # type: ignore
13 changes: 11 additions & 2 deletions superduperdb/datalayer/mongodb/query.py
@@ -331,6 +331,9 @@ class FindOne(SelectOne):

type_id: t.Literal['mongodb.FindOne'] = 'mongodb.FindOne'

def add_fold(self, fold: str) -> 'Select':
raise NotImplementedError

def __call__(self, db):
if self.collection is not None:
return SuperDuperCursor.wrap_document(
@@ -361,6 +364,12 @@ class Aggregate(Select):

type_id: t.Literal['mongodb.Aggregate'] = 'mongodb.Aggregate'

def add_fold(self, fold: str) -> 'Select':
raise NotImplementedError

def is_trivial(self) -> bool:
raise NotImplementedError

@property
def select_ids(self) -> 'Select':
raise NotImplementedError
@@ -489,7 +498,6 @@ class InsertMany(Insert):
verbose: bool = True
args: t.List = dc.field(default_factory=list)
kwargs: t.Dict = dc.field(default_factory=dict)
valid_prob: float = 0.05
encoders: t.List = dc.field(default_factory=list)

type_id: t.Literal['mongodb.InsertMany'] = 'mongodb.InsertMany'
@@ -506,11 +514,12 @@ def select_using_ids(self, ids):
return Find(collection=self.collection, args=[{'_id': {'$in': ids}}])

def __call__(self, db):
valid_prob = self.kwargs.get('valid_prob', 0.5)
for e in self.encoders:
db.add(e)
documents = [r.encode() for r in self.documents]
for r in documents:
if random.random() < self.valid_prob:
if random.random() < valid_prob:
r['_fold'] = 'valid'
else:
r['_fold'] = 'train'
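A note on the InsertMany hunks above: valid_prob is no longer a dataclass field (previously defaulting to 0.05) but is read from kwargs at call time, with the fallback default moving to 0.5. A minimal sketch of the new calling convention; the collection, documents, and db values here are assumed to exist in the caller's scope, not taken from this diff:

# Hypothetical sketch: `valid_prob` now travels inside `kwargs` rather than
# as a field on the dataclass itself.
insert = InsertMany(
    collection=collection,        # assumed: an existing Collection
    documents=documents,          # assumed: a list of Document objects
    kwargs={'valid_prob': 0.1},   # ~10% of inserted rows get _fold == 'valid'
)
insert(db)                        # assumed: an existing datalayer handle
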
25 changes: 15 additions & 10 deletions superduperdb/models/transformers/wrapper.py
@@ -31,13 +31,13 @@ def TransformersTrainerConfiguration(identifier: str, *args, **kwargs):
class Pipeline(Model):
tokenizer: t.Optional[t.Callable] = None

def __post_init__(self, db):
def __post_init__(self):
if not self.device:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.object.to(self.device)
if not isinstance(self.tokenizer, Artifact):
self.tokenizer = Artifact(_artifact=self.tokenizer)
super().__post_init__(db)
self.tokenizer = Artifact(artifact=self.tokenizer)
super().__post_init__()

@property
def pipeline(self):
@@ -57,7 +57,7 @@ def _get_data(
**tokenizer_kwargs,
):
tokenizing_function = TokenizingFunction(
self.tokenizer.a, key=X_key, **tokenizer_kwargs
self.tokenizer.artifact, key=X_key, **tokenizer_kwargs
)
train_data = query_dataset_factory(
select=self.training_select,
@@ -104,7 +104,7 @@ def _fit( # type: ignore[override]
prefetch_size: int = _DEFAULT_PREFETCH_SIZE,
tokenizer_kwargs: t.Dict[str, t.Any] = {},
**kwargs,
):
) -> t.Optional[t.Dict[str, t.Any]]:
if configuration is not None:
self.configuration = configuration
if select is not None:
@@ -114,6 +114,8 @@ def _fit( # type: ignore[override]
if metrics is not None:
self.metrics = metrics

evaluate = kwargs.pop('evaluate', True)

if isinstance(X, str):
train_data, valid_data = self._get_data(
db,
@@ -131,18 +133,21 @@
eval_dataset=valid_data,
**kwargs,
)
evaluation = None

try:
trainer.train()
evaluation = trainer.evaluate()
if evaluate:
evaluation = trainer.evaluate()
except Exception as exc:
log.error(f"Training could not finish :: {exc}")

return evaluation
log.exception(f"Training could not finish :: {exc}")
raise
else:
return evaluation

def _predict_one(self, input: str, **kwargs):
tokenized_input = self.tokenizer.a(input, return_tensors='pt').to(self.device)
return self.object.a(**tokenized_input, **kwargs)
return self.object.artifact(**tokenized_input, **kwargs)

def _predict(self, input: str, **kwargs):
if not isinstance(input, list):
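Two behavioral points in this file's hunks: Artifact is now constructed with artifact= and unwrapped via .artifact (previously _artifact= / .a), and _fit now logs with log.exception and re-raises instead of swallowing training failures, returning the evaluation only on success (or None when evaluate=False is popped from kwargs). A minimal sketch of the renamed access pattern, assuming a Hugging Face tokenizer object is in scope:

wrapped = Artifact(artifact=tokenizer)                        # was Artifact(_artifact=...)
batch = wrapped.artifact('hello world', return_tensors='pt')  # was wrapped.a(...)
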
12 changes: 8 additions & 4 deletions superduperdb/models/vanilla/wrapper.py
@@ -3,18 +3,22 @@


class Function(Model):
vanilla = True

def predict_one(self, x, **kwargs):
return self.object.a(x, **kwargs)
return self.object.artifact(x, **kwargs)

def _predict(self, docs, num_workers=0):
def _predict(self, docs, num_workers=0, **kwargs):
outputs = []
if not isinstance(docs, list):
return self.predict_one(docs)
if num_workers:
pool = multiprocessing.Pool(processes=num_workers)
for r in pool.map(self.object, docs):
for r in pool.map(self.object.artifact, docs):
outputs.append(r)
pool.close()
pool.join()
else:
for r in docs:
outputs.append(self.object(r))
outputs.append(self.object.artifact(r))
return outputs
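With the wrapped callable now unwrapped via self.object.artifact, the vanilla wrapper can be exercised roughly as below. This is a sketch only; the constructor arguments shown are assumptions, not part of this diff:

# Hypothetical usage sketch (constructor arguments assumed, not from the diff).
fn = Function(identifier='double', object=Artifact(artifact=lambda x: 2 * x))
fn._predict([1, 2, 3], num_workers=0)   # -> [2, 4, 6]
# num_workers > 0 routes through multiprocessing.Pool, which requires the
# wrapped callable to be picklable; a lambda like the one above is not.
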
4 changes: 2 additions & 2 deletions superduperdb/vector_search/faiss_index.py
@@ -27,13 +27,13 @@ def __init__(self, h, index, measure='l2', faiss_index=None):
super().__init__(h, index, measure)
self.h = self.h.astype('float32')
if faiss_index is None:
if measure == 'css':
if measure == 'cosine':
self.h = self.h / (numpy.linalg.norm(self.h, axis=1)[:, None])
if measure == 'l2':
faiss_index = faiss.index_factory(
self.h.shape[1], 'Flat', faiss.METRIC_L2
)
elif measure in {'css', 'dot'}:
elif measure in {'cosine', 'dot'}:
faiss_index = faiss.index_factory(
self.h.shape[1], 'Flat', faiss.METRIC_INNER_PRODUCT
)
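The 'css' to 'cosine' rename leans on a standard identity: once the rows of h are L2-normalized (the division in the hunk above), inner product equals cosine similarity, so faiss.METRIC_INNER_PRODUCT serves as the cosine measure. A quick self-contained check of that identity:

import numpy as np

h = np.random.randn(4, 8).astype('float32')
h_unit = h / np.linalg.norm(h, axis=1)[:, None]   # same normalization as in the diff
cosine = h_unit @ h_unit.T                        # inner products are now cosines
assert np.allclose(np.diag(cosine), 1.0)          # self-similarity is 1
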
5 changes: 5 additions & 0 deletions tests/integration/conftest.py
@@ -1,4 +1,5 @@
import random
import numpy as np
import time
from threading import Thread
from unittest import mock
@@ -32,6 +33,10 @@
as much as possible. This will make it easier to understand the test suite.
'''

# Set the seeds
random.seed(42)
np.random.seed(42)


mongodb_test_config = {
'host': '0.0.0.0',
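Worth noting: these seeds pin Python's random module and NumPy only; any test drawing randomness through torch would additionally need torch.manual_seed(42), which this diff does not add.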