Add more unit tests #464

Merged: 4 commits, Jul 20, 2023

9 changes: 9 additions & 0 deletions requirements/requirements-test.txt
@@ -4,6 +4,8 @@
#
# pip-compile requirements-test.in
#
accelerate==0.21.0
# via -r requirements.in
aiohttp==3.8.4
# via
# fsspec
@@ -132,6 +134,8 @@ fsspec[http]==2023.6.0
# huggingface-hub
# lightning
# pytorch-lightning
greenlet==2.0.2
# via sqlalchemy
grpcio==1.49.1
# via
# pymilvus
@@ -242,6 +246,7 @@ numexpr==2.8.4
numpy==1.24.4
# via
# -r requirements.in
# accelerate
# langchain
# lightning
# numexpr
@@ -263,6 +268,7 @@ ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -293,6 +299,7 @@ protobuf==4.23.4
# ray
psutil==5.9.5
# via
# accelerate
# distributed
# lightning
py==1.11.0
@@ -346,6 +353,7 @@ pytz==2023.3
# pandas
pyyaml==6.0.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -449,6 +457,7 @@ toolz==0.12.0
torch==2.0.0
# via
# -r requirements.in
# accelerate
# lightning
# pytorch-lightning
# torchmetrics
1 change: 1 addition & 0 deletions requirements/requirements.in
@@ -26,3 +26,4 @@ ray>=2.4.0
scikit-learn>=1.1.3
torch>=2.0.0,!=2.0.1
transformers>=4.29.1
accelerate>=0.20.1
9 changes: 9 additions & 0 deletions requirements/requirements.txt
@@ -4,6 +4,8 @@
#
# pip-compile requirements.in
#
accelerate==0.21.0
# via -r requirements.in
aiohttp==3.8.4
# via
# fsspec
@@ -127,6 +129,8 @@ fsspec[http]==2023.6.0
# huggingface-hub
# lightning
# pytorch-lightning
greenlet==2.0.2
# via sqlalchemy
grpcio==1.49.1
# via
# pymilvus
@@ -226,6 +230,7 @@ numexpr==2.8.4
numpy==1.24.4
# via
# -r requirements.in
# accelerate
# langchain
# lightning
# numexpr
@@ -247,6 +252,7 @@ ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -274,6 +280,7 @@ protobuf==4.23.4
# ray
psutil==5.9.5
# via
# accelerate
# distributed
# lightning
py==1.11.0
@@ -321,6 +328,7 @@ pytz==2023.3
# pandas
pyyaml==6.0.1
# via
# accelerate
# dask
# distributed
# huggingface-hub
@@ -417,6 +425,7 @@ toolz==0.12.0
torch==2.0.0
# via
# -r requirements.in
# accelerate
# lightning
# pytorch-lightning
# torchmetrics
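All of the pins above follow from the single new line in requirements.in: adding accelerate>=0.20.1 and re-running pip-compile resolves accelerate==0.21.0 and records it as a new reason under the existing numpy, packaging, psutil, pyyaml, and torch pins in both compiled files; the new greenlet==2.0.2 entry appears to be an incidental refresh picked up by the same re-compile.
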
2 changes: 1 addition & 1 deletion superduperdb/core/documents.py
@@ -11,7 +11,7 @@ class Document:
that resource to a mix of jsonable content or `bytes`
"""

_DEFAULT_ID_KEY = '_id'
_DEFAULT_ID_KEY: str = '_id'

def __init__(self, content: t.Dict):
self.content = content
2 changes: 1 addition & 1 deletion superduperdb/core/model.py
@@ -1,5 +1,5 @@
import inspect
from dask.distributed import Future
import inspect
import dataclasses as dc
import typing as t

1 change: 0 additions & 1 deletion superduperdb/core/vector_index.py
@@ -130,7 +130,6 @@ def get_nearest(
models, keys = self.models_keys
if len(models) != len(keys):
raise ValueError(f'len(models={models}) != len(keys={keys})')

within_ids = ids or ()

if db.db.id_field in like.content: # type: ignore
13 changes: 11 additions & 2 deletions superduperdb/datalayer/mongodb/query.py
@@ -331,6 +331,9 @@ class FindOne(SelectOne):

type_id: t.Literal['mongodb.FindOne'] = 'mongodb.FindOne'

def add_fold(self, fold: str) -> 'Select':
raise NotImplementedError

def __call__(self, db):
if self.collection is not None:
return SuperDuperCursor.wrap_document(
@@ -361,6 +364,12 @@ class Aggregate(Select):

type_id: t.Literal['mongodb.Aggregate'] = 'mongodb.Aggregate'

def add_fold(self, fold: str) -> 'Select':
raise NotImplementedError

def is_trivial(self) -> bool:
raise NotImplementedError

@property
def select_ids(self) -> 'Select':
raise NotImplementedError
@@ -489,7 +498,6 @@ class InsertMany(Insert):
verbose: bool = True
args: t.List = dc.field(default_factory=list)
kwargs: t.Dict = dc.field(default_factory=dict)
valid_prob: float = 0.05
encoders: t.List = dc.field(default_factory=list)

type_id: t.Literal['mongodb.InsertMany'] = 'mongodb.InsertMany'
@@ -506,11 +514,12 @@ def select_using_ids(self, ids):
return Find(collection=self.collection, args=[{'_id': {'$in': ids}}])

def __call__(self, db):
valid_prob = self.kwargs.get('valid_prob', 0.5)
for e in self.encoders:
db.add(e)
documents = [r.encode() for r in self.documents]
for r in documents:
if random.random() < self.valid_prob:
if random.random() < valid_prob:
r['_fold'] = 'valid'
else:
r['_fold'] = 'train'
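A note on the InsertMany hunks above: valid_prob is no longer a dataclass field (previously defaulting to 0.05) but is read from kwargs at call time, with the fallback default moving to 0.5. A minimal sketch of the new calling convention; the collection, documents, and db values here are assumed to exist in the caller's scope, not taken from this diff:

# Hypothetical sketch: `valid_prob` now travels inside `kwargs` rather than
# as a field on the dataclass itself.
insert = InsertMany(
    collection=collection,        # assumed: an existing Collection
    documents=documents,          # assumed: a list of Document objects
    kwargs={'valid_prob': 0.1},   # ~10% of inserted rows get _fold == 'valid'
)
insert(db)                        # assumed: an existing datalayer handle
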
25 changes: 15 additions & 10 deletions superduperdb/models/transformers/wrapper.py
@@ -31,13 +31,13 @@ def TransformersTrainerConfiguration(identifier: str, *args, **kwargs):
class Pipeline(Model):
tokenizer: t.Optional[t.Callable] = None

def __post_init__(self, db):
def __post_init__(self):
if not self.device:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.object.to(self.device)
if not isinstance(self.tokenizer, Artifact):
self.tokenizer = Artifact(_artifact=self.tokenizer)
super().__post_init__(db)
self.tokenizer = Artifact(artifact=self.tokenizer)
super().__post_init__()

@property
def pipeline(self):
@@ -57,7 +57,7 @@ def _get_data(
**tokenizer_kwargs,
):
tokenizing_function = TokenizingFunction(
self.tokenizer.a, key=X_key, **tokenizer_kwargs
self.tokenizer.artifact, key=X_key, **tokenizer_kwargs
)
train_data = query_dataset_factory(
select=self.training_select,
@@ -104,7 +104,7 @@ def _fit( # type: ignore[override]
prefetch_size: int = _DEFAULT_PREFETCH_SIZE,
tokenizer_kwargs: t.Dict[str, t.Any] = {},
**kwargs,
):
) -> t.Optional[t.Dict[str, t.Any]]:
if configuration is not None:
self.configuration = configuration
if select is not None:
@@ -114,6 +114,8 @@ def _fit( # type: ignore[override]
if metrics is not None:
self.metrics = metrics

evaluate = kwargs.pop('evaluate', True)

if isinstance(X, str):
train_data, valid_data = self._get_data(
db,
@@ -131,18 +133,21 @@
eval_dataset=valid_data,
**kwargs,
)
evaluation = None

try:
trainer.train()
evaluation = trainer.evaluate()
if evaluate:
evaluation = trainer.evaluate()
except Exception as exc:
log.error(f"Training could not finish :: {exc}")

return evaluation
log.exception(f"Training could not finish :: {exc}")
raise
else:
return evaluation

def _predict_one(self, input: str, **kwargs):
tokenized_input = self.tokenizer.a(input, return_tensors='pt').to(self.device)
return self.object.a(**tokenized_input, **kwargs)
return self.object.artifact(**tokenized_input, **kwargs)

def _predict(self, input: str, **kwargs):
if not isinstance(input, list):
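Two behavioral points in this file's hunks: Artifact is now constructed with artifact= and unwrapped via .artifact (previously _artifact= / .a), and _fit now logs with log.exception and re-raises instead of swallowing training failures, returning the evaluation only on success (or None when evaluate=False is popped from kwargs). A minimal sketch of the renamed access pattern, assuming a Hugging Face tokenizer object is in scope:

wrapped = Artifact(artifact=tokenizer)                        # was Artifact(_artifact=...)
batch = wrapped.artifact('hello world', return_tensors='pt')  # was wrapped.a(...)
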
12 changes: 8 additions & 4 deletions superduperdb/models/vanilla/wrapper.py
@@ -3,18 +3,22 @@


class Function(Model):
vanilla = True

def predict_one(self, x, **kwargs):
return self.object.a(x, **kwargs)
return self.object.artifact(x, **kwargs)

def _predict(self, docs, num_workers=0):
def _predict(self, docs, num_workers=0, **kwargs):
outputs = []
if not isinstance(docs, list):
return self.predict_one(docs)
if num_workers:
pool = multiprocessing.Pool(processes=num_workers)
for r in pool.map(self.object, docs):
for r in pool.map(self.object.artifact, docs):
outputs.append(r)
pool.close()
pool.join()
else:
for r in docs:
outputs.append(self.object(r))
outputs.append(self.object.artifact(r))
return outputs
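With the wrapped callable now unwrapped via self.object.artifact, the vanilla wrapper can be exercised roughly as below. This is a sketch only; the constructor arguments shown are assumptions, not part of this diff:

# Hypothetical usage sketch (constructor arguments assumed, not from the diff).
fn = Function(identifier='double', object=Artifact(artifact=lambda x: 2 * x))
fn._predict([1, 2, 3], num_workers=0)   # -> [2, 4, 6]
# num_workers > 0 routes through multiprocessing.Pool, which requires the
# wrapped callable to be picklable; a lambda like the one above is not.
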
4 changes: 2 additions & 2 deletions superduperdb/vector_search/faiss_index.py
@@ -27,13 +27,13 @@ def __init__(self, h, index, measure='l2', faiss_index=None):
super().__init__(h, index, measure)
self.h = self.h.astype('float32')
if faiss_index is None:
if measure == 'css':
if measure == 'cosine':
self.h = self.h / (numpy.linalg.norm(self.h, axis=1)[:, None])
if measure == 'l2':
faiss_index = faiss.index_factory(
self.h.shape[1], 'Flat', faiss.METRIC_L2
)
elif measure in {'css', 'dot'}:
elif measure in {'cosine', 'dot'}:
faiss_index = faiss.index_factory(
self.h.shape[1], 'Flat', faiss.METRIC_INNER_PRODUCT
)
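The 'css' to 'cosine' rename leans on a standard identity: once the rows of h are L2-normalized (the division in the hunk above), inner product equals cosine similarity, so faiss.METRIC_INNER_PRODUCT serves as the cosine measure. A quick self-contained check of that identity:

import numpy as np

h = np.random.randn(4, 8).astype('float32')
h_unit = h / np.linalg.norm(h, axis=1)[:, None]   # same normalization as in the diff
cosine = h_unit @ h_unit.T                        # inner products are now cosines
assert np.allclose(np.diag(cosine), 1.0)          # self-similarity is 1
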
5 changes: 5 additions & 0 deletions tests/integration/conftest.py
@@ -1,4 +1,5 @@
import random
import numpy as np
import time
from threading import Thread
from unittest import mock
@@ -32,6 +33,10 @@
as much as possible. This will make it easier to understand the test suite.
'''

# Set the seeds
random.seed(42)
np.random.seed(42)


mongodb_test_config = {
'host': '0.0.0.0',
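Worth noting: these seeds pin Python's random module and NumPy only; any test drawing randomness through torch would additionally need torch.manual_seed(42), which this diff does not add.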