diff --git a/CHANGELOG.md b/CHANGELOG.md
index 279a25120..9cfd8262e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 #### Changed defaults / behaviours
 
+- Deprecate vanilla `DataType`
+- Remove `_BaseEncodable` and its subclasses from the project
+
 #### New Features & Functionality
 
 - Streamlit component and server
diff --git a/plugins/anthropic/superduper_anthropic/model.py b/plugins/anthropic/superduper_anthropic/model.py
index 81c878f7f..28076aa34 100644
--- a/plugins/anthropic/superduper_anthropic/model.py
+++ b/plugins/anthropic/superduper_anthropic/model.py
@@ -23,9 +23,9 @@ class Anthropic(APIBaseModel):
 
     client_kwargs: t.Dict[str, t.Any] = dc.field(default_factory=dict)
 
-    def __post_init__(self, db, artifacts, example):
+    def __post_init__(self, db, example):
         self.model = self.model or self.identifier
-        super().__post_init__(db, artifacts, example=example)
+        super().__post_init__(db, example=example)
 
     def init(self, db=None):
         """Initialize the model.
diff --git a/plugins/cohere/superduper_cohere/model.py b/plugins/cohere/superduper_cohere/model.py
index b2d4f495d..973c9aa1e 100644
--- a/plugins/cohere/superduper_cohere/model.py
+++ b/plugins/cohere/superduper_cohere/model.py
@@ -23,8 +23,8 @@ class Cohere(APIBaseModel):
 
     client_kwargs: t.Dict[str, t.Any] = dc.field(default_factory=dict)
 
-    def __post_init__(self, db, artifacts, example):
-        super().__post_init__(db, artifacts, example=example)
+    def __post_init__(self, db, example):
+        super().__post_init__(db, example=example)
 
         self.identifier = self.identifier or self.model
 
@@ -47,8 +47,8 @@ class CohereEmbed(Cohere):
     batch_size: int = 100
     signature: str = 'singleton'
 
-    def __post_init__(self, db, artifacts, example):
-        super().__post_init__(db, artifacts, example=example)
+    def __post_init__(self, db, example):
+        super().__post_init__(db, example=example)
         if self.shape is None:
             self.shape = self.shapes[self.identifier]
diff --git a/plugins/ibis/superduper_ibis/data_backend.py b/plugins/ibis/superduper_ibis/data_backend.py
index c798ffa92..e0ad93e66 100644
--- a/plugins/ibis/superduper_ibis/data_backend.py
+++ b/plugins/ibis/superduper_ibis/data_backend.py
@@ -14,7 +14,7 @@ from superduper.backends.local.artifacts import FileSystemArtifactStore
 from superduper.base import exceptions
 from superduper.base.enums import DBType
-from superduper.components.datatype import DataType
+from superduper.components.datatype import BaseDataType
 from superduper.components.schema import Schema
 from superduper.components.table import Table
 
@@ -69,7 +69,7 @@ def __init__(self, uri: str, flavour: t.Optional[str] = None):
         self.overwrite = False
         self._setup(conn)
 
-        if uri.startswith('snowflake://') or uri.startswith('sqlite://'):
+        if uri.startswith('snowflake://'):
             self.bytes_encoding = 'base64'
             self.datatype_presets = {'vector': 'superduper.ext.numpy.encoder.Array'}
 
@@ -190,7 +190,7 @@ def drop_table_or_collection(self, name: str):
     def create_output_dest(
         self,
         predict_id: str,
-        datatype: t.Union[FieldType, DataType],
+        datatype: t.Union[FieldType, BaseDataType],
         flatten: bool = False,
     ):
         """Create a table for the output of the model. 
diff --git a/plugins/ibis/superduper_ibis/query.py b/plugins/ibis/superduper_ibis/query.py index 6ec315581..dd18d0867 100644 --- a/plugins/ibis/superduper_ibis/query.py +++ b/plugins/ibis/superduper_ibis/query.py @@ -11,7 +11,7 @@ ) from superduper.base.cursor import SuperDuperCursor from superduper.base.exceptions import DatabackendException -from superduper.components.datatype import Encodable +from superduper.components.datatype import _Encodable from superduper.components.schema import Schema from superduper.misc.special_dicts import SuperDuperFlatEncode @@ -81,7 +81,7 @@ def _model_update_impl( d = { "_source": str(source_id), f"{CFG.output_prefix}{predict_id}": output.x - if isinstance(output, Encodable) + if isinstance(output, _Encodable) else output, "id": str(uuid.uuid4()), } diff --git a/plugins/ibis/superduper_ibis/utils.py b/plugins/ibis/superduper_ibis/utils.py index 1aeeb87b3..b8215bf55 100644 --- a/plugins/ibis/superduper_ibis/utils.py +++ b/plugins/ibis/superduper_ibis/utils.py @@ -1,20 +1,12 @@ from ibis.expr.datatypes import dtype from superduper.components.datatype import ( - Artifact, BaseDataType, File, - LazyArtifact, - LazyFile, - Native, ) from superduper.components.schema import ID, FieldType, Schema SPECIAL_ENCODABLES_FIELDS = { File: "str", - LazyFile: "str", - Artifact: "str", - LazyArtifact: "str", - Native: "json", } diff --git a/plugins/jina/superduper_jina/model.py b/plugins/jina/superduper_jina/model.py index f2137c7f5..dd221e8fc 100644 --- a/plugins/jina/superduper_jina/model.py +++ b/plugins/jina/superduper_jina/model.py @@ -16,8 +16,8 @@ class Jina(APIBaseModel): api_key: t.Optional[str] = None - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example=example) + def __post_init__(self, db, example): + super().__post_init__(db, example=example) self.identifier = self.identifier or self.model self.client = JinaAPIClient(model_name=self.identifier, api_key=self.api_key) @@ -41,8 +41,8 @@ class JinaEmbedding(Jina): shape: t.Optional[t.Sequence[int]] = None signature: str = 'singleton' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) if self.shape is None: self.shape = (len(self.client.encode_batch(['shape'])[0]),) diff --git a/plugins/mongodb/superduper_mongodb/data_backend.py b/plugins/mongodb/superduper_mongodb/data_backend.py index 474c650d4..ec13feefb 100644 --- a/plugins/mongodb/superduper_mongodb/data_backend.py +++ b/plugins/mongodb/superduper_mongodb/data_backend.py @@ -9,7 +9,7 @@ from superduper.backends.base.data_backend import BaseDataBackend from superduper.backends.base.metadata import MetaDataStoreProxy from superduper.base.enums import DBType -from superduper.components.datatype import DataType +from superduper.components.datatype import BaseDataType from superduper.components.schema import Schema from superduper.misc.colors import Colors @@ -140,7 +140,7 @@ def disconnect(self): def create_output_dest( self, predict_id: str, - datatype: t.Union[str, DataType], + datatype: t.Union[str, BaseDataType], flatten: bool = False, ): """Create an output collection for a component. 
diff --git a/plugins/mongodb/superduper_mongodb/query.py b/plugins/mongodb/superduper_mongodb/query.py index efa0277e6..d6e6e862f 100644 --- a/plugins/mongodb/superduper_mongodb/query.py +++ b/plugins/mongodb/superduper_mongodb/query.py @@ -532,6 +532,7 @@ def process_find_part(part): method, args, kwargs = part # args: (filter, projection, *args) filter = copy.deepcopy(args[0]) if len(args) > 0 else {} + filter = dict(filter) filter.update(self._get_filter_conditions()) args = tuple((filter, *args[1:])) diff --git a/plugins/openai/superduper_openai/model.py b/plugins/openai/superduper_openai/model.py index 6e6fead09..44739a0e7 100644 --- a/plugins/openai/superduper_openai/model.py +++ b/plugins/openai/superduper_openai/model.py @@ -50,8 +50,8 @@ class _OpenAI(APIBaseModel): openai_api_base: t.Optional[str] = None client_kwargs: t.Optional[dict] = dc.field(default_factory=dict) - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) assert isinstance(self.client_kwargs, dict) @@ -151,8 +151,8 @@ class OpenAIChatCompletion(_OpenAI): batch_size: int = 1 prompt: str = '' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) self.takes_context = True def _format_prompt(self, context, X): diff --git a/plugins/sentence_transformers/superduper_sentence_transformers/model.py b/plugins/sentence_transformers/superduper_sentence_transformers/model.py index fc8245959..3eba51dcf 100644 --- a/plugins/sentence_transformers/superduper_sentence_transformers/model.py +++ b/plugins/sentence_transformers/superduper_sentence_transformers/model.py @@ -4,7 +4,7 @@ from superduper.backends.query_dataset import QueryDataset from superduper.base.enums import DBType from superduper.components.component import ensure_initialized -from superduper.components.datatype import DataType, dill_lazy +from superduper.components.datatype import dill_serializer from superduper.components.model import Model, Signature, _DeviceManaged DEFAULT_PREDICT_KWARGS = { @@ -39,9 +39,7 @@ class SentenceTransformer(Model, _DeviceManaged): """ - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, 'DataType']]] = ( - ('object', dill_lazy), - ) + _fields = {'object': dill_serializer} object: t.Optional[_SentenceTransformer] = None model: t.Optional[str] = None @@ -50,8 +48,8 @@ class SentenceTransformer(Model, _DeviceManaged): postprocess: t.Union[None, t.Callable] = None signature: Signature = 'singleton' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example=example) + def __post_init__(self, db, example): + super().__post_init__(db, example=example) if self.model is None: self.model = self.identifier diff --git a/plugins/sklearn/plugin_test/test_sklearn.py b/plugins/sklearn/plugin_test/test_sklearn.py index 10949fedc..cfbafc2c7 100644 --- a/plugins/sklearn/plugin_test/test_sklearn.py +++ b/plugins/sklearn/plugin_test/test_sklearn.py @@ -104,7 +104,7 @@ def test_sklearn(db): identifier='test', object=SVC(), ) - assert 'object' in m.artifact_schema.fields + assert 'object' in m.class_schema.fields db.apply(m, force=True) assert db.show('model') == ['test'] diff --git a/plugins/torch/superduper_torch/model.py b/plugins/torch/superduper_torch/model.py index 1e276a03b..abb9c15d6 100644 --- a/plugins/torch/superduper_torch/model.py +++ 
b/plugins/torch/superduper_torch/model.py
@@ -153,8 +153,8 @@ class TorchModel(Model, _DeviceManaged):
     optimizer_state: t.Optional[t.Any] = None
     loader_kwargs: t.Dict = dc.field(default_factory=lambda: {})
 
-    def __post_init__(self, db, artifacts, example):
-        super().__post_init__(db, artifacts=artifacts, example=example)
+    def __post_init__(self, db, example):
+        super().__post_init__(db, example=example)
 
         if self.optimizer_state is not None:
             self.optimizer.load_state_dict(self.optimizer_state)
diff --git a/plugins/transformers/superduper_transformers/model.py b/plugins/transformers/superduper_transformers/model.py
index 5adfe5727..515c4d984 100644
--- a/plugins/transformers/superduper_transformers/model.py
+++ b/plugins/transformers/superduper_transformers/model.py
@@ -78,11 +78,11 @@ class TransformersTrainer(TrainingArguments, Trainer):
         t.Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
     ] = None
 
-    def __post_init__(self, db, artifacts):
+    def __post_init__(self, db):
         assert self.output_dir == '' or self.output_dir == self.identifier
         self.output_dir = self.identifier
         TrainingArguments.__post_init__(self)
-        return Trainer.__post_init__(self, db, artifacts)
+        return Trainer.__post_init__(self, db)
 
     @property
     def native_arguments(self):
@@ -214,10 +214,10 @@ def _build_pipeline(self):
             model=self.model_cls.from_pretrained(self.model_name),
         )
 
-    def __post_init__(self, db, artifacts, example):
+    def __post_init__(self, db, example):
         if self.pipeline is None:
             self._build_pipeline()
-        super().__post_init__(db, artifacts, example)
+        super().__post_init__(db, example)
 
     def predict(self, text: str):
         """Predict the class of a single text.
@@ -284,12 +284,12 @@ class LLM(BaseLLM):
         ("tokenizer_kwargs", dill_serializer),
     )
 
-    def __post_init__(self, db, artifacts, example):
+    def __post_init__(self, db, example):
         if not self.identifier:
             self.identifier = self.adapter_id or self.model_name_or_path
 
         # TODO: Compatible with the bug of artifact sha1 equality and will be deleted
-        super().__post_init__(db, artifacts, example)
+        super().__post_init__(db, example)
 
     @classmethod
     def from_pretrained(
diff --git a/plugins/transformers/superduper_transformers/training.py b/plugins/transformers/superduper_transformers/training.py
index a59c086a4..e231c985a 100644
--- a/plugins/transformers/superduper_transformers/training.py
+++ b/plugins/transformers/superduper_transformers/training.py
@@ -184,11 +184,11 @@ class LLMTrainer(TrainingArguments, SuperDuperTrainer):
     num_gpus: t.Optional[int] = None
     ray_configs: t.Optional[dict] = None
 
-    def __post_init__(self, db, artifacts):
+    def __post_init__(self, db):
         self.output_dir = self.output_dir or os.path.join("output", self.identifier)
         if self.num_gpus and 'num_gpus' not in self.compute_kwargs:
             self.compute_kwargs['num_gpus'] = self.num_gpus
-        return SuperDuperTrainer.__post_init__(self, db, artifacts)
+        return SuperDuperTrainer.__post_init__(self, db)
 
     def build(self):
         """Build the training arguments."""
diff --git a/plugins/vllm/superduper_vllm/model.py b/plugins/vllm/superduper_vllm/model.py
index 3032ec809..975d0a67a 100644
--- a/plugins/vllm/superduper_vllm/model.py
+++ b/plugins/vllm/superduper_vllm/model.py
@@ -19,7 +19,7 @@ class _VLLMCore(Model):
 
     vllm_params: dict = dc.field(default_factory=dict)
 
-    def __post_init__(self, db, artifacts, example):
-        super().__post_init__(db, artifacts, example)
+    def __post_init__(self, db, example):
+        super().__post_init__(db, example)
         assert "model" in self.vllm_params, "model is required in vllm_params"
         self._async_llm = None
diff --git a/superduper/__init__.py 
b/superduper/__init__.py index 4e0d8f189..4b41d44b1 100644 --- a/superduper/__init__.py +++ b/superduper/__init__.py @@ -24,7 +24,7 @@ from .components.application import Application from .components.component import Component from .components.dataset import Dataset -from .components.datatype import DataType, dill_serializer, pickle_serializer +from .components.datatype import BaseDataType, dill_serializer, pickle_serializer from .components.listener import Listener from .components.metric import Metric from .components.model import ( @@ -39,7 +39,7 @@ from .components.streamlit import Streamlit from .components.table import Table from .components.template import QueryTemplate, Template -from .components.vector_index import VectorIndex, vector +from .components.vector_index import VectorIndex REQUIRES = [ 'superduper=={}'.format(__version__), @@ -52,7 +52,7 @@ 'config', 'logging', 'superduper', - 'DataType', + 'BaseDataType', 'Document', 'code', 'ObjectModel', @@ -62,7 +62,6 @@ 'model', 'Listener', 'VectorIndex', - 'vector', 'Dataset', 'Metric', 'Plugin', diff --git a/superduper/backends/base/data_backend.py b/superduper/backends/base/data_backend.py index 30ad4ca77..0e5377977 100644 --- a/superduper/backends/base/data_backend.py +++ b/superduper/backends/base/data_backend.py @@ -4,7 +4,7 @@ from superduper import logging from superduper.backends.base.query import Query -from superduper.components.datatype import DataType +from superduper.components.datatype import BaseDataType if t.TYPE_CHECKING: from superduper.components.schema import Schema @@ -75,7 +75,7 @@ def build_artifact_store(self): def create_output_dest( self, predict_id: str, - datatype: t.Union[str, DataType], + datatype: t.Union[str, BaseDataType], flatten: bool = False, ): """Create an output destination for the database. diff --git a/superduper/backends/base/query.py b/superduper/backends/base/query.py index ae5f90ada..1fce727af 100644 --- a/superduper/backends/base/query.py +++ b/superduper/backends/base/query.py @@ -5,7 +5,7 @@ import typing as t import uuid from abc import abstractmethod -from functools import wraps +from functools import cached_property, wraps from superduper import CFG, logging from superduper.base.constant import ( @@ -309,31 +309,10 @@ def flavour(self): """Return the flavour of the query.""" return self._get_flavour() - @property + @cached_property def documents(self): """Return the documents.""" - def _wrap_document(document): - if not isinstance(document, Document): - if isinstance(document, dict): - document = Document(document) - else: - try: - table = self.db.load('table', self.table) - except FileNotFoundError: - raise FileNotFoundError( - "Table not found. 
Please provide a document or a dictionary" - ) - field = [ - k - for k in table.schema.fields - if k not in [self.primary_id, '_fold'] - and not k.startswith(CFG.output_prefix) - ] - assert len(field) == 1 - document = Document({field[0]: document}) - return document - def _update_part(documents): nonlocal self doc_args = (documents, *self.parts[0][1][1:]) @@ -345,8 +324,9 @@ def _update_part(documents): if one_document: documents = [documents] wrapped_documents = [] + for document in documents: - document = _wrap_document(document) + document = Document(document) wrapped_documents.append(document) if one_document: diff --git a/superduper/base/apply.py b/superduper/base/apply.py index 6168f3c79..5c353e709 100644 --- a/superduper/base/apply.py +++ b/superduper/base/apply.py @@ -139,6 +139,7 @@ def _apply( object.db = db serialized = object.dict(metadata=False) + del serialized['uuid'] create_events = {} @@ -183,7 +184,9 @@ def wrapper(child): del current_serialized['uuid'] # finds the fields where there is a difference - this_diff = Document(current_serialized).diff(serialized) + this_diff = Document(current_serialized, schema=current_serialized.schema).diff( + serialized + ) logging.info(f'Found identical {object.huuid}') if not this_diff: @@ -215,7 +218,7 @@ def wrapper(child): # this is necessary to prevent inconsistencies # this takes the difference between # the current and - serialized = serialized.update(this_diff).encode() + serialized = serialized.update(this_diff).encode(keep_schema=False) # assign/ increment the version since # this breaks a previous version @@ -229,7 +232,6 @@ def wrapper(child): Document(object.metadata).map(wrapper, lambda x: isinstance(x, Component)) else: - # if object.identifier == apply_status = 'update' current.handle_update_or_same(object) @@ -238,7 +240,12 @@ def wrapper(child): # update the existing component with the change # data from the applied component - serialized = current.dict().update(serialized).update(this_diff).encode() + serialized = ( + current.dict() + .update(serialized) + .update(this_diff) + .encode(keep_schema=False) + ) logging.info(f'Found update {object.huuid}') @@ -250,7 +257,7 @@ def wrapper(child): # need to be applied, do that now Document(object.metadata).map(wrapper, lambda x: isinstance(x, Component)) - serialized = serialized.encode() + serialized = serialized.encode(keep_schema=False) object.version = 0 apply_status = 'new' diff --git a/superduper/base/datalayer.py b/superduper/base/datalayer.py index b14ab0f6d..fa3895a8f 100644 --- a/superduper/base/datalayer.py +++ b/superduper/base/datalayer.py @@ -19,12 +19,12 @@ from superduper.base.cursor import SuperDuperCursor from superduper.base.document import Document from superduper.components.component import Component -from superduper.components.datatype import DataType +from superduper.components.datatype import BaseDataType from superduper.components.schema import Schema from superduper.components.table import Table from superduper.misc.annotations import deprecated from superduper.misc.colors import Colors -from superduper.misc.download import download_from_one +from superduper.misc.importing import import_object from superduper.misc.retry import db_retry DBResult = t.Any @@ -255,7 +255,7 @@ def _insert( self, insert: Query, refresh: bool = True, - datatypes: t.Sequence[DataType] = (), + datatypes: t.Sequence[BaseDataType] = (), auto_schema: bool = True, ) -> InsertResult: """ @@ -275,7 +275,7 @@ def _insert( 'train' if random.random() >= s.CFG.fold_probability else 
'valid', ) if auto_schema and self.cfg.auto_schema: - self._auto_create_table(insert.table, insert.documents) + schema = self._auto_create_table(insert.table, insert.documents).schema timeout = 5 @@ -301,6 +301,8 @@ def _insert( f'{insert.table} not found after {timeout} seconds' ' table auto creation likely has failed or is stalling...' ) + for r in insert.documents: + r.schema = schema inserted_ids = insert.do_execute(self) @@ -333,6 +335,7 @@ def _auto_create_table(self, table_name, documents): table = Table(identifier=table_name, schema=schema) logging.info(f"Creating table {table_name} with schema {schema.fields_set}") self.apply(table, force=True) + return table def _select(self, select: Query, reference: bool = True) -> SelectResult: """ @@ -563,6 +566,7 @@ def load( :param version: [Optional] Numerical version. :param allow_hidden: Toggle to ``True`` to allow loading of deprecated components. + :param huuid: [Optional] human-readable UUID of the component to load. :param uuid: [Optional] UUID of the component to load. """ if version is not None: @@ -589,7 +593,20 @@ def load( info = self.metadata.get_component_by_uuid( uuid=uuid, allow_hidden=allow_hidden ) - c = Document.decode(info, db=self) + try: + class_schema = import_object(info['_path']).build_class_schema() + except KeyError: + # if defined in __main__ then the class is directly serialized + assert '_object' in info + from superduper.components.datatype import Blob, dill_serializer + + bytes_ = Blob( + identifier=info['_object'].split(':')[-1], db=self + ).unpack() + object = dill_serializer.decode_data(bytes_) + class_schema = object.build_class_schema() + + c = Document.decode(info, db=self, schema=class_schema) c.db = self if c.cache: logging.info(f'Adding {c.huuid} to cache') @@ -655,16 +672,6 @@ def _remove_component_version( self._delete_artifacts(r['uuid'], info) self.metadata.delete_component_version(type_id, identifier, version=version) - def _get_content_for_filter(self, filter) -> Document: - if isinstance(filter, dict): - filter = Document(filter) - if '_id' not in filter: - filter['_id'] = 0 - download_from_one(filter) - if not filter['_id']: - del filter['_id'] - return filter - def replace(self, object: t.Any): """ Replace a model in the artifact store with an updated object. 
@@ -691,7 +698,7 @@ def _replace_fn(component): return f'&:component:{component.huuid}' serialized = serialized.map(_replace_fn, lambda x: isinstance(x, Component)) - serialized = serialized.encode() + serialized = serialized.encode(keep_schema=False) self._delete_artifacts(object.uuid, info) serialized = self._save_artifact(object.uuid, serialized) @@ -775,7 +782,8 @@ def select_nearest( if not isinstance(like, Document): assert isinstance(like, dict) like = Document(like) - like = self._get_content_for_filter(like) + # TODO deprecate + # like = self._get_content_for_filter(like) logging.info('Getting vector-index') vi = self.load('vector_index', vector_index) if outputs is None: diff --git a/superduper/base/document.py b/superduper/base/document.py index 1b9b71345..7e57c2939 100644 --- a/superduper/base/document.py +++ b/superduper/base/document.py @@ -13,13 +13,7 @@ from superduper.base.leaf import Leaf, import_item from superduper.base.variables import _replace_variables from superduper.components.component import Component -from superduper.components.datatype import ( - Blob, - Encodable, - FileItem, - Native, - _BaseEncodable, -) +from superduper.components.datatype import BaseDataType, Blob, File from superduper.components.schema import Schema, get_schema from superduper.misc.reference import parse_reference from superduper.misc.special_dicts import MongoStyleDict, SuperDuperFlatEncode @@ -28,7 +22,6 @@ from superduper.base.datalayer import Datalayer -ContentType = t.Union[t.Dict, Encodable] LeafMetaType = t.Type['Leaf'] _VERSION_LIMIT = 1000 @@ -70,10 +63,11 @@ def __init__(self, getters=None): def add_getter(self, name: str, getter: t.Callable): """Add a getter for a reference type.""" - if name == 'blob': - self._getters[name].append(_build_blob_getter(getter)) - else: - self._getters[name].append(getter) + self._getters[name].append(getter) + # if name == 'blob': + # self._getters[name].append(_build_blob_getter(getter)) + # else: + # self._getters[name].append(getter) def run(self, name, data): """Run the getters one by one until one returns a value.""" @@ -102,6 +96,11 @@ def _diff(r1, r2, d): if isinstance(r1[k], Leaf): r1k = r1[k].dict(metadata=False) + + if r2[k] is None: + d[k] = None + continue + r2k = r2[k].dict(metadata=False) if set(r1k.keys()) != set(r2k.keys()): @@ -181,7 +180,7 @@ def _map(r): return fn(r) return r - return Document(_map(self)) + return Document(_map(self), schema=self.schema) def diff(self, other: 'Document'): """Get a `Document` with the difference to `other` inside. @@ -190,11 +189,16 @@ def diff(self, other: 'Document'): """ out: t.Dict = {} _diff(self, other, out) - return Document(out) + return Document(out, schema=self.schema) - def update(self, other: 'Document'): + def update(self, other: t.Union['Document', dict]): """Update document with values from other.""" - return Document(_update(dict(self), dict(other))) + schema = self.schema or Schema('tmp', fields={}) + + if isinstance(other, Document) and other.schema: + assert other.schema is not None + schema = schema.update(other.schema) + return Document(_update(dict(self), dict(other)), schema=schema) def encode( self, @@ -202,6 +206,7 @@ def encode( leaves_to_keep: t.Sequence = (), metadata: bool = True, defaults: bool = True, + keep_schema: bool = True, ) -> SuperDuperFlatEncode: """Encode the document to a format that can be used in a database. 
@@ -224,6 +229,10 @@ def encode( out = schema.encode_data( out, builds, blobs, files, leaves_to_keep=leaves_to_keep ) + + if not keep_schema: + del out['_schema'] + out = _deep_flat_encode( out, builds=builds, @@ -275,6 +284,7 @@ def decode( builds = r.get(KEY_BUILDS, {}) + # TODO is this the right place for this? # Important: Leaf.identifier or Component.type_id:Component.identifier are # are used as the key, but must be set if not present. for k in builds: @@ -297,18 +307,23 @@ def decode( # Prioritize using the local artifact storage getter, # and then use the DB read getter. if r.get(KEY_BLOBS): - getters.add_getter('blob', lambda x: r[KEY_BLOBS].get(x)) + getters.add_getter( + 'blob', lambda x: Blob(identifier=x, bytes=r[KEY_BLOBS].get(x)) + ) + + def my_getter(x): + return File(path=r[KEY_FILES].get(x.split(':')[-1]), db=db) if r.get(KEY_FILES): - getters.add_getter('file', lambda x: r[KEY_FILES].get(x.split(':')[-1])) + getters.add_getter('file', my_getter) # Add a remote file getter - getters.add_getter('file', _get_file_remote_callback) - getters.add_getter('blob', _get_local_blob) + # getters.add_getter('file', _get_file_remote_callback) + # getters.add_getter('blob', _get_local_blob) if db is not None: getters.add_getter('component', lambda x: _get_component(db, x)) - getters.add_getter('blob', _get_artifact_callback(db)) + getters.add_getter('blob', _get_blob_callback(db)) getters.add_getter('file', _get_file_callback(db)) if schema is not None: @@ -348,12 +363,25 @@ def set_variables(self, **kwargs) -> 'Document': def __repr__(self) -> str: return f'Document({repr(dict(self))})' + @staticmethod + def decode_blobs(schema, r): + for k, v in schema.fields.items(): + if k not in r: + continue + if not isinstance(v, BaseDataType): + continue + if v.encodable == 'artifact': + r[k] = v.decode_data(r[k]) + return r + def unpack(self, leaves_to_keep: t.Sequence = ()) -> t.Any: """Returns the content, but with any encodables replaced by their contents. :param leaves_to_keep: The types of leaves to keep. """ out = _unpack(self, leaves_to_keep=leaves_to_keep) + if self.schema is not None: + out = self.decode_blobs(self.schema, out) if '_base' in out: out = out['_base'] return out @@ -364,6 +392,7 @@ def __deepcopy__(self, momo): return new_doc +# TODO what is this? Looks like it should be in superduper_mongodb class QueryUpdateDocument(Document): """A document that is used to update a document in a database. @@ -402,6 +431,7 @@ def _create_metadata_update(update, original=None): update = {'$set': update} return update + # TODO needed? def to_template(self, **substitutions): """ Convert the document to a template with variables. @@ -443,9 +473,7 @@ def encode( def _unpack(item: t.Any, db=None, leaves_to_keep: t.Sequence = ()) -> t.Any: - if isinstance(item, _BaseEncodable) and not any( - [isinstance(item, leaf) for leaf in leaves_to_keep] - ): + if isinstance(item, Leaf) and not isinstance(item, tuple(leaves_to_keep)): return item.unpack() elif isinstance(item, dict): return {k: _unpack(v, leaves_to_keep=leaves_to_keep) for k, v in item.items()} @@ -515,13 +543,10 @@ def _deep_flat_encode( blobs[r.identifier] = r.bytes return '&:blob:' + r.identifier - if isinstance(r, FileItem): + if isinstance(r, File): files[r.identifier] = r.path return '&:file:' + r.identifier - if isinstance(r, Native): - return r.x - # TODO what is this?? 
from superduper.backends.base.query import _BaseQuery @@ -653,10 +678,11 @@ def _deep_flat_decode(r, builds, getters: Getters, db: t.Optional['Datalayer'] = if isinstance(r, dict) and '_object' in r: dict_ = {k: v for k, v in r.items() if k != '_object'} dict_ = _deep_flat_decode(dict_, builds, getters=getters, db=db) - object = _deep_flat_decode( - builds[r['_object'][1:]], builds, getters=getters, db=db - ) - instance = import_item(object=object.unpack(), dict=dict_, db=db) + from superduper.components.datatype import dill_serializer + + bytes_ = Blob(identifier=r['_object'].split(':')[-1], db=db).unpack() + object = dill_serializer.decode_data(bytes_) + instance = import_item(object=object, dict=dict_, db=db) return instance if isinstance(r, dict): literals = r.get('_literals', []) @@ -746,16 +772,28 @@ def pull_file(): def _get_file_callback(db): - def callback(path): - def pull_file(): - identifier = path.split(':')[-1] - return db.artifact_store.get_file(identifier), path + def callback(ref): + return File(identifier=ref, db=db) + + return callback - return pull_file + +def _get_blob_callback(db): + def callback(ref): + return Blob(identifier=ref, db=db) return callback +# def _get_file_callback(db): +# def callback(path): +# def pull_file(): +# identifier = path.split(':')[-1] +# return db.artifact_store.get_file(identifier), path +# return pull_file +# return callback + + def _get_local_blob(x, loader=None): if x.split('://')[0].startswith('file'): return loader(x) diff --git a/superduper/base/leaf.py b/superduper/base/leaf.py index e6275d15e..7e8f9aa6f 100644 --- a/superduper/base/leaf.py +++ b/superduper/base/leaf.py @@ -288,14 +288,12 @@ def dict(self, metadata: bool = True, defaults: bool = True): if self.literals: r['_literals'] = list(self.literals) - from superduper.components.datatype import Artifact, dill_serializer + from superduper.components.datatype import dill_serializer if self.__class__.__module__ == '__main__': - cls = Artifact( - x=self.__class__, - datatype=dill_serializer, + return Document( + {'_object': dill_serializer.encode_data(self.__class__), **r} ) - return Document({'_object': cls, **r}) path = f'{self.__class__.__module__}.{self.__class__.__name__}' return Document({'_path': path, **r}) diff --git a/superduper/components/application.py b/superduper/components/application.py index 08443c541..68aae3f15 100644 --- a/superduper/components/application.py +++ b/superduper/components/application.py @@ -31,8 +31,8 @@ class Application(Component): namespace: t.Optional[t.Sequence[t.Tuple[str, str]]] = None link: t.Optional[str] = None - def __post_init__(self, db, artifacts): - super().__post_init__(db, artifacts) + def __post_init__(self, db): + super().__post_init__(db) self._sort_components_and_set_upstream() def _sort_components_and_set_upstream(self): diff --git a/superduper/components/component.py b/superduper/components/component.py index a9729ee0c..b94acef47 100644 --- a/superduper/components/component.py +++ b/superduper/components/component.py @@ -24,7 +24,7 @@ if t.TYPE_CHECKING: from superduper import Document from superduper.base.datalayer import Datalayer - from superduper.components.datatype import DataType + from superduper.components.datatype import BaseDataType from superduper.components.plugin import Plugin @@ -87,6 +87,20 @@ def getdeepattr(obj, attr): ComponentTuple.__doc__ = 'noqa' +def _is_optional_callable(annotation) -> bool: + """Tell if an annotation is t.Optional[t.Callable]. 
+
+    >>> _is_optional_callable(t.Optional[t.Callable])
+    True
+    """
+    # Check if the annotation is of the form Optional[...]
+    if t.get_origin(annotation) is t.Union:
+        # Get the type inside Optional and check if it is Callable
+        inner_type = t.get_args(annotation)[0]  # Optional[X] means X is at index 0
+        return inner_type is t.Callable
+    return False
+
+
 class ComponentMeta(LeafMeta):
     """Metaclass for the `Component` class.
 
@@ -113,6 +127,33 @@ def __new__(cls, name, bases, dct):
             if hasattr(attr_value, 'events'):
                 new_cls.triggers.add(attr_name)
 
+        import copy
+
+        new_cls._fields = copy.deepcopy(new_cls._fields)
+        for base in bases:
+            try:
+                new_cls._fields.update(
+                    {k: v for k, v in base._fields.items() if k not in new_cls._fields}
+                )
+            except AttributeError:
+                continue
+
+        for field in dc.fields(new_cls):
+            if field.name in new_cls._fields:
+                continue
+            try:
+                # Annotations may be strings or types (PEP 563), so check both forms
+                annotation = new_cls.__annotations__[field.name]
+                if annotation in {
+                    't.Callable',
+                    't.Optional[t.Callable]',
+                    't.Callable | None',
+                }:
+                    new_cls._fields[field.name] = 'dill_serializer'
+                elif annotation is t.Callable or _is_optional_callable(annotation):
+                    new_cls._fields[field.name] = 'dill_serializer'
+            except KeyError:
+                continue
         return new_cls
 
 
@@ -122,7 +163,6 @@ class Component(Leaf, metaclass=ComponentMeta):
     Class to represent superduper.io serializable entities
     that can be saved into a database.
 
-    :param artifacts: A dictionary of artifacts paths and `DataType` objects
     :param upstream: A list of upstream components
     :param plugins: A list of plugins to be used in the component.
     :param cache: (Optional) If set `true` the component will not be cached
@@ -137,13 +177,11 @@ class Component(Leaf, metaclass=ComponentMeta):
     breaks: t.ClassVar[t.Sequence] = ()
     triggers: t.ClassVar[t.List] = []
     type_id: t.ClassVar[str] = 'component'
-    # TODO do something more elegant than this
-    _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, 'DataType']]] = ()
+    _fields: t.ClassVar[t.Dict[str, t.Union['BaseDataType', str]]] = {}
     set_post_init: t.ClassVar[t.Sequence] = ('version',)
 
     upstream: t.Optional[t.List["Component"]] = None
     plugins: t.Optional[t.List["Plugin"]] = None
-    artifacts: dc.InitVar[t.Optional[t.Dict]] = None
     cache: t.Optional[bool] = True
     status: t.Optional[Status] = None
     build_variables: t.Dict | None = None
@@ -455,10 +493,9 @@ def leaves(self):
         leaf_keys = [k for k in r.keys(True) if isinstance(r[k], Leaf)]
         return {k: r[k] for k in leaf_keys}
 
-    def __post_init__(self, db, artifacts):
+    def __post_init__(self, db):
         super().__post_init__(db)
 
-        self.artifacts = artifacts
         self.version: t.Optional[int] = None
         if not self.identifier:
             raise ValueError('identifier cannot be empty or None')
@@ -506,46 +543,22 @@ def _init(item):
                 return [_init(i) for i in item]
 
             if isinstance(item, Leaf):
-                item.init(db=db)
+                item.init()
                 return item.unpack()
 
             return item
 
+        schema = self.build_class_schema()
+
         for f in dc.fields(self):
             item = getattr(self, f.name)
-            unpacked_item = _init(item)
-            setattr(self, f.name, unpacked_item)
+            item = _init(item)
+            if f.name in self._fields and isinstance(item, bytes):
+                item = schema.fields[f.name].decode_data(item)
+            setattr(self, f.name, item)
 
         return self
 
-    @property
-    def artifact_schema(self):
-        """Returns `Schema` representation for the serializers in the component."""
-        from superduper import Schema
-        from superduper.components.datatype import dill_serializer
-
-        schema = {}
-        lookup = dict(self._artifacts)
-        if self.artifacts is not None: 
- lookup.update(self.artifacts) - for f in dc.fields(self): - a = getattr(self, f.name) - if a is None: - continue - if f.name in lookup and not isinstance(a, Leaf): - schema[f.name] = lookup[f.name] - continue - if isinstance(getattr(self, f.name), Component): - continue - item = getattr(self, f.name) - if ( - callable(item) - and not isinstance(item, Leaf) - and not getattr(item, 'importable', False) - ): - schema[f.name] = dill_serializer - return Schema(identifier=f'serializer/{self.identifier}', fields=schema) - def _pre_create(self, db: Datalayer, startup_cache: t.Dict = {}): self.status = Status.initializing @@ -679,6 +692,8 @@ def export( r = self.dict(defaults=defaults, metadata=metadata).encode( defaults=defaults, metadata=metadata ) + + del r['_schema'] if not metadata: del r['uuid'] @@ -786,7 +801,6 @@ def dict( ) -> 'Document': """A dictionary representation of the component.""" from superduper import Document - from superduper.components.datatype import Artifact, File r = super().dict(metadata=metadata, defaults=defaults) @@ -801,14 +815,14 @@ def _convert_components_to_refs(r): if refs: r = _convert_components_to_refs(r) - s = self.artifact_schema + + s = self.build_class_schema() + + from superduper.components.datatype import Saveable for k in s.fields: - attr = getattr(self, k) - if isinstance(attr, (Artifact, File)): - r[k] = attr - else: - r[k] = s.fields[k](x=attr) # artifact or file + if r[k] is not None and not isinstance(r[k], Saveable): + r[k] = s.fields[k].encode_data(r[k]) if metadata: r['type_id'] = self.type_id @@ -817,7 +831,19 @@ def _convert_components_to_refs(r): if r.get('status') is not None: r['status'] = str(self.status) - return Document(r) + + return Document(r, schema=s) + + @classmethod + def build_class_schema(cls): + from superduper import Schema + from superduper.components.datatype import INBUILT_DATATYPES + + _fields = cls._fields.copy() + for k in _fields: + if isinstance(_fields[k], str): + _fields[k] = INBUILT_DATATYPES[_fields[k]] + return Schema(f'{cls.__name__}/class_schema', fields=_fields) # TODO needed? 
looks to have legacy "_content" @classmethod diff --git a/superduper/components/cron_job.py b/superduper/components/cron_job.py index 56de2743b..66475bcc1 100644 --- a/superduper/components/cron_job.py +++ b/superduper/components/cron_job.py @@ -33,7 +33,7 @@ class FunctionCronJob(CronJob): :param function: Callable to run """ - _artifacts = (('function', dill_serializer),) + _fields = {'function': dill_serializer} function: t.Callable diff --git a/superduper/components/dataset.py b/superduper/components/dataset.py index 59a4259cc..99e803df6 100644 --- a/superduper/components/dataset.py +++ b/superduper/components/dataset.py @@ -10,10 +10,7 @@ from superduper.base.datalayer import Datalayer from superduper.base.document import Document from superduper.components.component import Component, ensure_initialized -from superduper.components.datatype import ( - DataType, - dill_serializer, -) +from superduper.components.datatype import dill_serializer class Dataset(Component): @@ -30,9 +27,7 @@ class Dataset(Component): """ type_id: t.ClassVar[str] = 'dataset' - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, DataType]]] = ( - ('raw_data', dill_serializer), - ) + _fields = {'raw_data': dill_serializer} select: t.Optional[Query] = None sample_size: t.Optional[int] = None @@ -41,13 +36,13 @@ class Dataset(Component): raw_data: t.Optional[t.Sequence[t.Any]] = None pin: bool = False - def __post_init__(self, db, artifacts): + def __post_init__(self, db): """Post-initialization method. :param artifacts: Optional additional artifacts for initialization. """ + super().__post_init__(db=db) self._data = None - return super().__post_init__(db, artifacts) @property @ensure_initialized @@ -105,9 +100,9 @@ class RemoteData(Component): type_id: t.ClassVar[str] = 'dataset' getter: t.Callable - def __post_init__(self, db, artifacts): + def __post_init__(self, db): self._data = None - return super().__post_init__(db, artifacts) + return super().__post_init__(db) @property def data(self): diff --git a/superduper/components/datatype.py b/superduper/components/datatype.py index 6eb199ee1..d5f32c739 100644 --- a/superduper/components/datatype.py +++ b/superduper/components/datatype.py @@ -1,12 +1,8 @@ -import base64 -import dataclasses as dc import hashlib import inspect -import io import json import os import pickle -import re import typing as t from abc import abstractmethod from functools import cached_property @@ -16,144 +12,12 @@ import numpy from superduper import CFG -from superduper.backends.base.artifacts import ( - _construct_file_id_from_uri, -) -from superduper.base.config import BytesEncoding from superduper.base.leaf import Leaf -from superduper.components.component import Component, ensure_initialized -from superduper.misc.annotations import component -from superduper.misc.hash import hash_path +from superduper.components.component import Component Decode = t.Callable[[bytes], t.Any] Encode = t.Callable[[t.Any], bytes] -if t.TYPE_CHECKING: - from superduper.base.datalayer import Datalayer - - -class IntermediateType: - """Intermediate data type # noqa.""" - - BYTES = 'bytes' - STRING = 'string' - - -def json_encode(object: t.Any, info: t.Optional[t.Dict] = None) -> str: - """Encode the dict to a JSON string. - - :param object: The object to encode - :param info: Optional information - """ - return json.dumps(object) - - -def json_decode(b: str, info: t.Optional[t.Dict] = None) -> t.Any: - """Decode the JSON string to an dict. 
- - :param b: The JSON string to decode - :param info: Optional information - """ - return json.loads(b) - - -def pickle_encode(object: t.Any, info: t.Optional[t.Dict] = None) -> bytes: - """Encodes an object using pickle. - - :param object: The object to encode. - :param info: Optional information. - """ - return pickle.dumps(object) - - -def pickle_decode(b: bytes, info: t.Optional[t.Dict] = None) -> t.Any: - """Decodes bytes using pickle. - - :param b: The bytes to decode. - :param info: Optional information. - """ - return pickle.loads(b) - - -def dill_encode(object: t.Any, info: t.Optional[t.Dict] = None) -> bytes: - """Encodes an object using dill. - - :param object: The object to encode. - :param info: Optional information. - """ - return dill.dumps(object, recurse=True) - - -def dill_decode(b: bytes, info: t.Optional[t.Dict] = None) -> t.Any: - """Decodes bytes using dill. - - :param b: The bytes to decode. - :param info: Optional information. - """ - return dill.loads(b) - - -def file_check(path: t.Any, info: t.Optional[t.Dict] = None) -> str: - """Checks if a file path exists. - - :param path: The file path to check. - :param info: Optional information. - :raises ValueError: If the path does not exist. - """ - if not (isinstance(path, str) and os.path.exists(path)): - raise ValueError(f"Path '{path}' does not exist") - return path - - -def torch_encode(object: t.Any, info: t.Optional[t.Dict] = None) -> bytes: - """Saves an object in torch format. - - :param object: The object to encode. - :param info: Optional information. - """ - import torch - - from superduper.ext.torch.utils import device_of - - if not isinstance(object, dict): - previous_device = str(device_of(object)) - object.to('cpu') - f = io.BytesIO() - torch.save(object, f) - object.to(previous_device) - else: - f = io.BytesIO() - torch.save(object, f) - - return f.getvalue() - - -def torch_decode(b: bytes, info: t.Optional[t.Dict] = None) -> t.Any: - """Decodes bytes to a torch model. - - :param b: The bytes to decode. - :param info: Optional information. - """ - import torch - - return torch.load(io.BytesIO(b)) - - -def bytes_to_base64(bytes): - """Converts bytes to base64. - - :param bytes: The bytes to convert. - """ - return base64.b64encode(bytes).decode('utf-8') - - -def base64_to_bytes(encoded): - """Decodes a base64 encoded string. - - :param encoded: The base64 encoded string. - """ - return base64.b64decode(encoded) - class DataTypeFactory: """Abstract class for creating a DataType # noqa.""" @@ -180,17 +44,13 @@ def create(data: t.Any) -> "BaseDataType": class BaseDataType(Component): - """Base class for datatype. - - :param shape: size of vector - """ + """Base class for datatype.""" type_id: t.ClassVar[str] = 'datatype' - # TODO this can just be an integer - shape: t.Optional[int] = None + cache: bool = True @abstractmethod - def encode_data(self, item, info: t.Optional[t.Dict] = None): + def encode_data(self, item): """Decode the item as `bytes`. :param item: The item to decode. @@ -198,713 +58,266 @@ def encode_data(self, item, info: t.Optional[t.Dict] = None): """ @abstractmethod - def decode_data(self, item, info: t.Optional[t.Dict] = None): + def decode_data(self, item): """Decode the item from bytes. :param item: The item to decode. :param info: The optional information dictionary. 
""" - def encode_data_with_identifier(self, item, info: t.Optional[t.Dict] = None): - b = self.encode_data(item=item, info=info) - if isinstance(b, bytes): - return b, hashlib.sha1(b).hexdigest() - else: - return b, hashlib.sha1(str(b).encode()).hexdigest() +class BaseVector(BaseDataType): + """Base class for vector. -class NativeVector(BaseDataType): - """Datatype for encoding vectors which are supported natively by databackend. + :param shape: size of vector :param dtype: Datatype of array to encode. """ - encodable: t.ClassVar[str] = 'native' + shape: int dtype: str = 'float64' - def __post_init__(self, db, artifacts): - self.encodable_cls = Native - return super().__post_init__(db, artifacts) - - def encode_data(self, item, info=None): - if isinstance(item, numpy.ndarray): - item = item.tolist() - return item + @abstractmethod + def encode_data(self, item): + pass - def decode_data(self, item, info=None): - return numpy.array(item).astype(self.dtype) + @abstractmethod + def decode_data(self, item): + pass -class Json2Str(BaseDataType): - """Datatype for encoding vectors which are supported natively by databackend.""" +class NativeVector(BaseVector): + """Datatype for encoding vectors which are supported as list by databackend.""" encodable: t.ClassVar[str] = 'native' - def __post_init__(self, db, artifacts): - # self.encodable_cls = Native - return super().__post_init__(db, artifacts) + def encode_data(self, item): + if isinstance(item, numpy.ndarray): + item = item.tolist() + return item - def encode_data(self, item, info=None): - return json.dumps(item) + def decode_data(self, item): + return numpy.array(item).astype(self.dtype) - def decode_data(self, item, info=None): - return json.loads(item) +class Vector(BaseVector): + """Vector meta-datatype for encoding vectors ready for search. -class DataType(BaseDataType): - """A data type component that defines how data is encoded and decoded. - - :param encoder: A callable that converts an encodable object of this - encoder to bytes. - :param decoder: A callable that converts bytes to an encodable object - of this encoder. - :param info: An optional information dictionary. - :param directory: The directory to store file types. - :param encodable: The type of encodable object ('encodable', - 'lazy_artifact', or 'file'). - :param bytes_encoding: The encoding type for bytes ('base64' or 'bytes'). - :param intermediate_type: Type of the intermediate data - [IntermediateType.BYTES, IntermediateType.STRING] - :param media_type: The media type. + :param dtype: Datatype of encoded arrays. """ - encoder: t.Optional[t.Callable] = None # not necessary if encodable is file - decoder: t.Optional[t.Callable] = None - info: t.Optional[t.Dict] = None # TODO deprecate - directory: t.Optional[str] = None # TODO needed? - encodable: str = 'encodable' - bytes_encoding: t.Optional[str] = CFG.bytes_encoding - intermediate_type: t.Optional[str] = IntermediateType.BYTES - media_type: t.Optional[str] = None - registered_types: t.ClassVar[t.Dict[str, "DataType"]] = {} - cache: bool = True - - def __post_init__(self, db, artifacts): - """Post-initialization hook. - - :param artifacts: The artifacts. 
- """ - super().__post_init__(db, artifacts) - if self.encodable in _ENCODABLES: - self.encodable_cls = _ENCODABLES[self.encodable] - else: - import importlib - - self.encodable_cls = importlib.import_module( - '.'.join(self.encodable.split('.')[:-1]) - ).__dict__[self.encodable.split('.')[-1]] + identifier: str = '' - self.bytes_encoding = self.bytes_encoding or CFG.bytes_encoding - self.register_datatype(self) + def __post_init__(self, db): + self.identifier = f'vector[{self.shape[0]}]' + return super().__post_init__(db) @property - def artifact(self): - """Check if the encodable is an artifact.""" - return self.encodable_cls.artifact - - def dict(self, metadata: bool = True, defaults: bool = True, refs: bool = False): - """Get the dictionary representation of the object.""" - r = super().dict(metadata=metadata, defaults=defaults, refs=refs) - if hasattr(self.bytes_encoding, 'value'): - r['bytes_encoding'] = str(self.bytes_encoding.value) # type: ignore[union-attr] - return r - - def __call__( - self, x: t.Optional[t.Any] = None, uri: t.Optional[str] = None - ) -> '_BaseEncodable': - """Create an instance of the encodable class. - - :param x: The optional content. - :param uri: The optional URI. - """ - return self.encodable_cls(datatype=self, x=x, uri=uri, db=self.db) - - @ensure_initialized - def encode_data_with_identifier(self, item, info: t.Optional[t.Dict] = None): - """Encode the item into bytes. - - :param item: The item to encode. - :param info: The optional information dictionary. - """ - info = info or {} - data = self.encoder(item, info) if self.encoder else item - sha1 = self.encodable_cls.get_hash(data) - data = self.bytes_encoding_after_encode(data) - return data, sha1 - - @ensure_initialized - def encode_data(self, item, info: t.Optional[t.Dict] = None): - """Encode the item into bytes. - - :param item: The item to encode. - :param info: The optional information dictionary. - """ - info = info or {} - data = self.encoder(item, info) if self.encoder else item - # data = self.bytes_encoding_after_encode(data) - return data - - @ensure_initialized - def decode_data(self, item, info: t.Optional[t.Dict] = None): - """Decode the item from bytes. - - :param item: The item to decode. - :param info: The optional information dictionary. - """ - info = info or {} - # item = self.bytes_encoding_before_decode(item) - return self.decoder(item, info=info) if self.decoder else item - - def bytes_encoding_after_encode(self, data): - """Encode the data to base64. - - if the bytes_encoding is BASE64 and the intermediate_type is BYTES - - :param data: Encoded data - """ - if ( - self.bytes_encoding == BytesEncoding.BASE64 - and self.intermediate_type == IntermediateType.BYTES - ): - return bytes_to_base64(data) - return data - - def bytes_encoding_before_decode(self, data): - """Encode the data to base64. 
+ def encodable(self): + return self.datatype_impl.encodable - if the bytes_encoding is BASE64 and the intermediate_type is BYTES + @cached_property + def datatype_impl(self): + if isinstance(CFG.datatype_presets.vector, str): + type_: str = CFG.datatype_presets.vector + else: + type_: str = self.db.databackend.datatype_presets['vector'] + module = '.'.join(type_.split('.')[:-1]) + cls = type_.split('.')[-1] + datatype = getattr(import_module(module), cls) + if inspect.isclass(datatype): + datatype = datatype('tmp', dtype=self.dtype, shape=self.shape) + return datatype - :param data: Decoded data - """ - if ( - self.bytes_encoding == BytesEncoding.BASE64 - and self.intermediate_type == IntermediateType.BYTES - ): - return base64_to_bytes(data) - return data - - @classmethod - def register_datatype(cls, instance): - """Register a datatype. - - :param instance: The datatype instance to register. - """ - cls.registered_types[instance.identifier] = instance + def encode_data(self, item): + return self.datatype_impl.encode_data(item=item) + def decode_data(self, item): + return self.datatype_impl.decode_data(item=item) -def encode_torch_state_dict(module, info): - """Encode torch state dictionary. - :param module: Module. - :param info: Information. - """ - import torch +class JSON(BaseDataType): + """Datatype for encoding vectors which are supported natively by databackend.""" - buffer = io.BytesIO() - torch.save(module.state_dict(), buffer) + encodable: t.ClassVar[str] = 'native' - return buffer.getvalue() + def __post_init__(self, db): + return super().__post_init__(db) + def encode_data(self, item): + return json.dumps(item) -# TODO migrate to torch plugin -class DecodeTorchStateDict: - """Torch state dictionary decoder. + def decode_data(self, item): + return json.loads(item) - :param cls: Torch state cls - """ - def __init__(self, cls): - self.cls = cls +class _Encodable: + encodable: t.ClassVar[str] = 'encodable' - def __call__(self, b: bytes, info: t.Dict): - """Decode the torch state dictionary. + def encode_data(self, item): + return self._encode_data(item) - :param b: Bytes. - :param info: Information. - """ - import torch - buffer = io.BytesIO(b) - module = self.cls(**info) - module.load_state_dict(torch.load(buffer)) - return module +class _Artifact: + encodable: t.ClassVar[str] = 'artifact' + def encode_data(self, item): + return Blob(bytes=self._encode_data(item)) -def _find_descendants(cls): - """Find descendants of the given class. - :param cls: The class to find descendants for. - """ - descendants = cls.__subclasses__() - for subclass in descendants: - descendants.extend(_find_descendants(subclass)) - return descendants +class _PickleMixin: + def _encode_data(self, item): + return pickle.dumps(item) + def decode_data(self, item): + return pickle.loads(item) -class _BaseEncodable(Leaf): - """Data variable wrapping encode-able item. - Encoding is controlled by the referred - to ``Encoder`` instance. +class PickleSerializer(_Artifact, _PickleMixin, BaseDataType): + """Serializer with pickle.""" - :param datatype: The datatype of the content. - :param uri: URI of the content, if any. - :param x: Wrapped content. - """ - identifier: str = '' - datatype: DataType - uri: t.Optional[str] = None # URI of the content to be deprecated - x: t.Optional[t.Any] = None - lazy: t.ClassVar[bool] = False - artifact: t.ClassVar[bool] = False +class PickleEncoder(_Encodable, _PickleMixin, BaseDataType): + """Pickle inline encoder.""" - def __post_init__(self, db): - """Post-initialization hook. 
- :param db: Datalayer instance. - """ - db = db or self.datatype.db - super().__post_init__(db) - if self.uri is not None and self.identifier is None: - self.identifier = _construct_file_id_from_uri(self.uri) +class _DillMixin: + def _encode_data(self, item): + return dill.dumps(item) - if self.uri and not re.match('^[a-z]{0,5}://', self.uri): - self.uri = f'file://{self.uri}' + def decode_data(self, item): + return dill.loads(item) - @property - def reference(self): - """Get the reference to the datatype.""" - return self.datatype.reference - def unpack(self): - """Unpack the content of the `Encodable`.""" - return self.x +class _DillSerializer(_Artifact, _DillMixin, BaseDataType): + ... - @staticmethod - def get_hash(data): - """Get the hash of the given data. - :param data: Data to hash. - """ - if isinstance(data, str): - bytes_ = data.encode() - elif isinstance(data, bytes): - bytes_ = data - elif isinstance(data, Native): - bytes_ = str([type(data), data.x]).encode() - else: - bytes_ = str(id(data)).encode() - return hashlib.sha1(bytes_).hexdigest() +class _DillEncoder(_Encodable, _DillMixin, BaseDataType): + ... - @staticmethod - def build_reference(identifier, source_data): - raise NotImplementedError +class FileType(BaseDataType): + """Type for encoding files on disk.""" -class Empty: - """Sentinel class # noqa.""" + encodable: t.ClassVar[str] = 'file' - def __repr__(self): - """Get the string representation of the Empty object.""" - return '' + def encode_data(self, item): + assert os.path.exists(item) + return File(path=item) + def decode_data(self, item): + return item -class Blob(Leaf): - """A wrapper to signify a blob for special treatment. - See `Document.encode` and related functions. +def get_hash(data): + """Get the hash of the given data. - :param identifier: The identifier of the blob. - :param bytes: The bytes of the blob. + :param data: Data to hash. """ - - identifier: str - bytes: bytes + if isinstance(data, str): + bytes_ = data.encode() + elif isinstance(data, bytes): + bytes_ = data + else: + bytes_ = str(id(data)).encode() + return hashlib.sha1(bytes_).hexdigest() -# TODO this is no longer stricly needed, since we now encode -# directly with `Schema` -class Encodable(_BaseEncodable): - """Class for encoding non-Python datatypes to the database. +class Saveable(Leaf): + """A Saveable base class.""" - :param x: The encodable object. - :param blob: The blob data. - """ + identifier: str = '' - x: t.Any = Empty() - artifact: t.ClassVar[bool] = False - blob: dc.InitVar[t.Optional[bytearray]] = None - - def __post_init__(self, db, blob): - super().__post_init__(db) - if isinstance(self.x, Empty): - self.datatype.init() - self.x = self.datatype.decode_data(blob) - - def _encode(self): - bytes_ = self.datatype.encode_data(self.x) - sha1 = self.get_hash(bytes_) - return bytes_, sha1 - - def to_artifact(self): - """Convert the encodable to an artifact.""" - r = self.dict() - r['datatype'].encodable = 'artifact' - kwargs = { - k: v for k, v in r.items() if k in inspect.signature(Artifact).parameters - } - return Artifact(**kwargs) - - def dict(self, metadata: bool = True, defaults: bool = True): - """Get the dictionary representation of the object.""" - r = super().dict(metadata=metadata, defaults=defaults) - del r['x'] - r['blob'], identifier = self._encode() - if not r['identifier']: - self.identifier = identifier - r['identifier'] = identifier - return r - - def init(self, db): - """Initialization method. - - :param db: The Datalayer instance. 
- """ + @property + @abstractmethod + def reference(self): pass - @classmethod - def get_datatype(cls, db, r): - """Get the datatype of the object. + @abstractmethod + def init(self): + pass - :param db: `Datalayer` instance to assist with - :param r: The object to get the datatype from - """ - if db is None: - try: - from superduper.components.datatype import serializers - - datatype = serializers[r['datatype']] - except KeyError: - raise ValueError( - f'You specified a serializer which doesn\'t have a' - f' default value: {r["datatype"]}' - ) - else: - datatype = db.datatypes[r['datatype']] - return datatype + @abstractmethod + def unpack(self): + pass -class Native(_BaseEncodable): - """Class for representing native data supported by the underlying database. +class File(Saveable): + """Placeholder for a file. - :param x: The encodable object. + :param path: Path to file. """ - x: t.Optional[t.Any] = None + path: str = '' - def __post_init__(self, db): + def __post_init__(self, db=None): + if not self.identifier: + self.identifier = get_hash(self.path) return super().__post_init__(db) - @classmethod - def _get_object(cls, db, r): - raise NotImplementedError - - -class Artifact(_BaseEncodable): - """Class for representing data to be saved on disk or in the artifact-store. - - :param x: The artifact object. - :param blob: The blob data. Can be a string or bytes. - if string, it should be in the format `&:blob:{file_id}` - if bytes, it should be the actual data. - """ - - artifact: t.ClassVar[bool] = True - x: t.Any = Empty() - blob: dc.InitVar[t.Optional[t.Union[str, bytes]]] = None - lazy: t.ClassVar[bool] = False - - def __post_init__(self, db, blob=None): - super().__post_init__(db) - self._blob = blob - self._reference = None - - if not (self.lazy and not isinstance(self._blob, bytes)): - self.init() - - def init(self, db=None): - """Initialize to load `x` with the actual file from the artifact store.""" - if isinstance(self._blob, t.Callable): - self._blob, _ = self._blob() - - if isinstance(self._blob, bytes): - blob = self._blob - self.datatype.init() - self.x = self.datatype.decoder(blob, info=None) - self._blob = None - - if not isinstance(self.x, Empty): + def init(self): + if self.path: return - - def dict(self, metadata: bool = True, defaults: bool = True): - """Get the dictionary representation of the object.""" - bytes, identifier = self._encode() - if not self.identifier: - self.identifier = identifier - r = super().dict(metadata=metadata, defaults=defaults) - del r['x'] - r['blob'] = Blob(identifier=self.identifier, bytes=bytes) - return r - - def _encode(self): - bytes_ = self.datatype.encoder(self.x) - sha1 = self.get_hash(bytes_) - return bytes_, sha1 + self.path = self.db.artifact_store.get_file(self.identifier) def unpack(self): - """Unpack the content of the `Encodable`.""" self.init() - return self.x - - @staticmethod - def build_reference(identifier, source_data): - """Build a reference to the blob. - - :param identifier: The identifier of the blob. - :param source_data: The source data. - :return: The reference to the blob. '&:blob:{file_id}' - """ - return f"&:blob:{identifier}" - - -class LazyArtifact(Artifact): - """Data to be saved and loaded only when needed.""" - - lazy: t.ClassVar[bool] = True - - def dict(self, metadata: bool = True, defaults: bool = True): - """Get the dictionary representation of the object.""" - self.init() - return super().dict(metadata=metadata, defaults=defaults) - - -class FileItem(Leaf): - """File item class. 
+ return self.path - :param identifier: The identifier of the file. - :param path: The path of the file. - """ - - identifier: str - path: str + # TODO - return this as self.dict()? + @property + def reference(self): + return f'&:file:{self.identifier}' -class File(_BaseEncodable): - """Data to be saved on disk and passed as a file reference. +class Blob(Saveable): + """Placeholder for a blob of bytes. - :param x: path to the file + :param bytes: Bytes blob. """ - lazy: t.ClassVar[bool] = False - artifact: t.ClassVar[bool] = True - - x: t.Any = Empty() - - def __post_init__(self, db): - super().__post_init__(db) - if isinstance(self.x, t.Callable): - self._file = self.x - self.x = Empty() - else: - self._file = None - - if not self.lazy: - self.init() + bytes: bytearray | None = None + identifier: str = '' - def init(self, db=None): - """Initialize to load `x` with the actual file from the artifact store.""" - if isinstance(self._file, t.Callable): - file_path, self.identifier = self._file() - self.x = file_path + def __post_init__(self, db=None): + if not self.identifier: + assert self.bytes is not None + self.identifier = get_hash(self.bytes) + return super().__post_init__(db) - if not isinstance(self.x, Empty): + def init(self): + if self.bytes: return - - def dict(self, metadata: bool = True, defaults: bool = True): - """Get the dictionary representation of the object.""" - self.identifier = self.identifier or hash_path(self.x) - r = super().dict(metadata=metadata, defaults=defaults) - r['x'] = FileItem(identifier=self.identifier, path=self.x) - return r + self.bytes = self.db.artifact_store.get_bytes(self.identifier) def unpack(self): - """Unpack and get the original data.""" - self.init() - return self.x - - @staticmethod - def build_reference(identifier, source_data): - """Build a reference to the file. - - :param identifier: The identifier of the file. - :param source_data: The source data. - :return: The reference to the file. '?:file:{file_id}' - """ - return f"&:file:{identifier}" - - -class LazyFile(File): - """Class is used to load a file only when needed.""" - - lazy: t.ClassVar[bool] = True - - def dict(self, metadata: bool = True, defaults: bool = True): - """Get the dictionary representation of the object.""" self.init() - return super().dict(metadata=metadata, defaults=defaults) - - -_ENCODABLES = { - 'encodable': Encodable, - 'artifact': Artifact, - 'lazy_artifact': LazyArtifact, - 'file': File, - 'native': Native, - 'lazy_file': LazyFile, -} - - -methods: t.Dict[str, t.Dict] = { - 'pickle': {'encoder': pickle_encode, 'decoder': pickle_decode}, - 'dill': {'encoder': dill_encode, 'decoder': dill_decode}, - 'torch': {'encoder': torch_encode, 'decoder': torch_decode}, - 'file': {'encoder': file_check, 'decoder': file_check}, - 'native': {'encoder': None, 'decoder': None}, -} - - -@component() -def get_serializer( - identifier: str, - method: str, - encodable: str = "encodable", - db: t.Optional['Datalayer'] = None, -): - """Get a serializer. - - :param identifier: The identifier of the serializer. - :param method: The method of the serializer. - :param encodable: The type of encodable object. - :param db: The Datalayer instance. 
- """ - return DataType( - identifier=identifier, - encodable=encodable, - db=db, - **methods[method], - ) - - -json_serializer = Json2Str('json') - - -pickle_encoder = get_serializer( - identifier='pickle_encoder', - method='pickle', - encodable='encodable', -) - - -pickle_serializer = get_serializer( - identifier='pickle', - method='pickle', - encodable='artifact', -) - -pickle_lazy = get_serializer( - identifier='pickle_lazy', - method='pickle', - encodable='lazy_artifact', -) - -dill_serializer = get_serializer( - identifier='dill', - method='dill', - encodable='artifact', -) - -dill_lazy = get_serializer( - identifier='dill_lazy', - method='dill', - encodable='lazy_artifact', -) - -torch_serializer = get_serializer( - identifier='torch', - method='torch', - encodable='lazy_artifact', -) - -file_serializer = get_serializer( - identifier='file', - method='file', - encodable='file', -) - -file_lazy = get_serializer( - identifier='file_lazy', - method='file', - encodable='lazy_file', -) - -serializers = { - 'pickle': pickle_serializer, - 'dill': dill_serializer, - 'torch': torch_serializer, - 'file': file_serializer, - 'pickle_lazy': pickle_lazy, - 'dill_lazy': dill_lazy, - 'file_lazy': file_lazy, -} - - -class Vector(BaseDataType): - """Vector meta-datatype for encoding vectors ready for search. - - :param dtype: Datatype of encoded arrays. - """ - - identifier: str = '' - dtype: str = 'float64' - - def __post_init__(self, db, artifacts): - self.identifier = f'vector[{self.shape[0]}]' - return super().__post_init__(db, artifacts) + return self.bytes @property - def encodable_cls(self): - return self.datatype_impl.encodable_cls - - @property - def encodable(self): - return self.datatype_impl.encodable - - @cached_property - def datatype_impl(self): - if isinstance(CFG.datatype_presets.vector, str): - type_: str = CFG.datatype_presets.vector - else: - type_: str = self.db.databackend.datatype_presets['vector'] - module = '.'.join(type_.split('.')[:-1]) - cls = type_.split('.')[-1] - datatype = getattr(import_module(module), cls) - if inspect.isclass(datatype): - datatype = datatype('tmp', dtype=self.dtype) - return datatype - - def encode_data(self, item, info: t.Optional[t.Dict] = None): - return self.datatype_impl.encode_data(item=item, info=info) + def reference(self): + return f'&:blob:{self.identifier}' + + +json_encoder = JSON('json') +pickle_encoder = PickleEncoder('pickle_encoder') +pickle_serializer = PickleSerializer('pickle_serializer') +dill_encoder = _DillEncoder('dill_encoder') +dill_serializer = _DillSerializer('dill_serializer') +file = FileType('file') + + +INBUILT_DATATYPES = { + dt.identifier: dt + for dt in [ + json_encoder, + pickle_encoder, + pickle_serializer, + dill_encoder, + dill_serializer, + file, + ] +} - def decode_data(self, item, info: t.Optional[t.Dict] = None): - return self.datatype_impl.decode_data(item=item, info=info) +DEFAULT_DATATYPE = PickleEncoder('DEFAULT') diff --git a/superduper/components/graph.py b/superduper/components/graph.py index 81a1734cd..94555033b 100644 --- a/superduper/components/graph.py +++ b/superduper/components/graph.py @@ -167,8 +167,8 @@ class Input(Model): identifier: str = '_input' signature: Signature = '*args' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) if isinstance(self.spec, str): self.signature = 'singleton' @@ -199,8 +199,8 @@ class DocumentInput(Model): identifier: str = '_input' 
signature: Signature = 'singleton' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) def predict(self, r): """Single prediction. @@ -255,7 +255,7 @@ class Graph(Model): outputs: t.List[t.Union[str, Model]] = dc.field(default_factory=list) signature: Signature = '*args,**kwargs' - def __post_init__(self, db, artifacts, example): + def __post_init__(self, db, example): self.G = nx.DiGraph() self.nodes = {} self.version = 0 @@ -284,7 +284,7 @@ def __post_init__(self, db, artifacts, example): on=on, update_edge=False, ) - super().__post_init__(db, artifacts=artifacts, example=example) + super().__post_init__(db, example=example) def connect( self, diff --git a/superduper/components/listener.py b/superduper/components/listener.py index dbad51820..090024fe8 100644 --- a/superduper/components/listener.py +++ b/superduper/components/listener.py @@ -51,12 +51,12 @@ def _get_metadata(self): metadata = super()._get_metadata() return {**metadata, 'output_table': self.output_table} - def __post_init__(self, db, artifacts): + def __post_init__(self, db): if not self.cdc_table and self.select: self.cdc_table = self.select.table self._set_upstream() - return super().__post_init__(db, artifacts) + return super().__post_init__(db) def handle_update_or_same(self, other): super().handle_update_or_same(other) diff --git a/superduper/components/metric.py b/superduper/components/metric.py index b6a96ede8..641464e28 100644 --- a/superduper/components/metric.py +++ b/superduper/components/metric.py @@ -1,6 +1,6 @@ import typing as t -from superduper.components.component import Component +from superduper.components.component import Component, ensure_initialized class Metric(Component): @@ -15,6 +15,7 @@ class Metric(Component): object: t.Callable + @ensure_initialized def __call__(self, x: t.Sequence[int], y: t.Sequence[int]) -> bool: """Call the metric object on the x and y data. 
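
The hunks above are the heart of this release's component refactor: `__post_init__` drops its `artifacts` parameter everywhere, and `Metric.__call__` gains the `ensure_initialized` guard. A minimal sketch of what that looks like downstream; the `accuracy` function and its inputs are illustrative, not from this diff, and it assumes a `Metric` can initialize without a connected `Datalayer`:

```python
import typing as t

from superduper.components.metric import Metric


def accuracy(x: t.Sequence[int], y: t.Sequence[int]) -> float:
    # Fraction of positions where prediction and label agree.
    return sum(int(a == b) for a, b in zip(x, y)) / max(len(x), 1)


# There is no `artifacts` argument to thread through `__post_init__`
# any more; the component is built from `identifier` and the callable.
metric = Metric(identifier='accuracy', object=accuracy)

# `ensure_initialized` lazily initializes the component on first call.
score = metric([1, 0, 1], [1, 1, 1])
```
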
diff --git a/superduper/components/model.py b/superduper/components/model.py index 1dcc53101..d9985cf16 100644 --- a/superduper/components/model.py +++ b/superduper/components/model.py @@ -21,8 +21,9 @@ from superduper.base.annotations import trigger from superduper.base.document import Document from superduper.base.exceptions import DatabackendException +from superduper.base.leaf import Leaf from superduper.components.component import Component, ComponentMeta, ensure_initialized -from superduper.components.datatype import DataType, dill_lazy +from superduper.components.datatype import BaseDataType, dill_serializer from superduper.components.metric import Metric from superduper.components.schema import Schema @@ -31,7 +32,7 @@ from superduper.components.dataset import Dataset -EncoderArg = t.Union[DataType, str, None] +EncoderArg = t.Union[BaseDataType, str, None] ModelInputType = t.Union[str, t.List[str], t.Tuple[t.List[str], t.Dict[str, str]]] Signature = t.Literal['*args', '**kwargs', '*args,**kwargs', 'singleton'] @@ -391,8 +392,8 @@ class Model(Component, metaclass=ModelMeta): example: dc.InitVar[t.Any | None] = None deploy: bool = False - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts) + def __post_init__(self, db, example): + super().__post_init__(db) self.example = example self._is_initialized = False @@ -1014,7 +1015,9 @@ def __getitem__(self, item): return _Node(item) -class ObjectModel(Model): +# This is if the user would like to +# import the object +class ImportedModel(Model): """Model component which wraps a Model to become serializable. Example: @@ -1030,10 +1033,7 @@ class ObjectModel(Model): """ breaks: t.ClassVar[t.Sequence] = ('object', 'trainer') - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, 'DataType']]] = ( - ('object', dill_lazy), - ) - object: t.Callable + object: Leaf method: t.Optional[str] = None @staticmethod @@ -1095,6 +1095,13 @@ def predict(self, *args, **kwargs): return getattr(self.object, self.method)(*args, **kwargs) +class ObjectModel(ImportedModel): + """A model to wrap a Python object and serialize it.""" + + _fields = {'object': dill_serializer} + object: t.Callable + + class APIBaseModel(Model): """APIBaseModel component which is used to make the type of API request. 
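
Worth pausing on the `ObjectModel` hunk above: the `_artifacts` class-variable tuple is replaced by a `_fields` mapping from attribute name to serializer, and `ObjectModel` becomes a thin subclass of the new `ImportedModel`. A hedged sketch of the resulting usage; the lambda and identifier are illustrative:

```python
from superduper.components.model import ObjectModel

# `_fields = {'object': dill_serializer}` declares that the wrapped
# callable is dill-serialized and stored as an artifact blob on save.
m = ObjectModel(identifier='add-two', object=lambda x: x + 2)

print(m.predict(40))  # -> 42
```
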
@@ -1105,8 +1112,8 @@ class APIBaseModel(Model): model: t.Optional[str] = None max_batch_size: int = 8 - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) if self.model is None: assert self.identifier is not None self.model = self.identifier @@ -1146,8 +1153,8 @@ def inputs(self): """Method to get ``Inputs`` instance for model inputs.""" return Inputs(self.runtime_params) - def __post_init__(self, db, artifacts): - super().__post_init__(db, artifacts) + def __post_init__(self, db): + super().__post_init__(db) self.params['model'] = self.model env_variables = re.findall(r'{([A-Z0-9\_]+)}', self.url) runtime_variables = re.findall(r'{([a-z0-9\_]+)}', self.url) @@ -1190,7 +1197,7 @@ class QueryModel(Model): """ preprocess: t.Optional[t.Callable] = None - postprocess: t.Optional[t.Union[t.Callable]] = None + postprocess: t.Optional[t.Callable] = None select: Query signature: Signature = '**kwargs' @@ -1247,10 +1254,10 @@ class SequentialModel(Model): models: t.List[Model] - def __post_init__(self, db, artifacts, example): + def __post_init__(self, db, example): self.signature = self.models[0].signature self.datatype = self.models[-1].datatype - return super().__post_init__(db, artifacts, example) + return super().__post_init__(db, example) @property def inputs(self) -> Inputs: diff --git a/superduper/components/plugin.py b/superduper/components/plugin.py index 816a4b47f..22256a910 100644 --- a/superduper/components/plugin.py +++ b/superduper/components/plugin.py @@ -6,7 +6,7 @@ import typing as t from superduper import Component, logging -from superduper.components.datatype import LazyFile, file_lazy +from superduper.components.datatype import File, file class Plugin(Component): @@ -18,19 +18,19 @@ class Plugin(Component): """ type_id: t.ClassVar[str] = "plugin" - _artifacts: t.ClassVar = (("path", file_lazy),) + _fields = {"path": file} path: str identifier: str = "" cache_path: str = "~/.superduper/plugins" - def __post_init__(self, db, artifacts): - if isinstance(self.path, LazyFile): + def __post_init__(self, db): + if isinstance(self.path, File): self._prepare_plugin() else: path_name = os.path.basename(self.path.rstrip("/")) self.identifier = self.identifier or f"plugin-{path_name}".replace(".", "_") self._install() - super().__post_init__(db, artifacts) + super().__post_init__(db) def _install(self): logging.debug(f"Installing plugin {self.identifier}") @@ -92,7 +92,7 @@ def _pip_install(self, requirement_path): def _prepare_plugin(self): plugin_name_tag = f"{self.identifier}" - assert isinstance(self.path, LazyFile) + assert isinstance(self.path, File) cache_path = os.path.expanduser(self.cache_path) uuid_path = os.path.join(cache_path, self.uuid) # Check if plugin is already in cache diff --git a/superduper/components/schema.py b/superduper/components/schema.py index d0b9a89e5..c3f291f5f 100644 --- a/superduper/components/schema.py +++ b/superduper/components/schema.py @@ -1,12 +1,11 @@ import base64 -import hashlib import typing as t from functools import cached_property from superduper.base.constant import KEY_SCHEMA from superduper.base.leaf import Leaf from superduper.components.component import Component -from superduper.components.datatype import BaseDataType, DataType +from superduper.components.datatype import BaseDataType, Saveable from superduper.misc.reference import parse_reference from superduper.misc.special_dicts import SuperDuperFlatEncode @@ 
-14,20 +13,6 @@ from superduper.base.document import Getters -def get_hash(data): - """Get the hash of the given data. - - :param data: Data to hash. - """ - if isinstance(data, str): - bytes_ = data.encode() - elif isinstance(data, bytes): - bytes_ = data - else: - bytes_ = str(id(data)).encode() - return hashlib.sha1(bytes_).hexdigest() - - class FieldType(Leaf): """Field type to represent the type of a field in a table. @@ -36,12 +21,13 @@ class FieldType(Leaf): :param identifier: The name of the data type. """ - identifier: t.Union[str, DataType] + identifier: t.Union[str, BaseDataType] def __post_init__(self, db): super().__post_init__(db) - if isinstance(self.identifier, DataType): + # TODO why would this happen? + if isinstance(self.identifier, BaseDataType): self.identifier = self.identifier.name elif isinstance(self.identifier, self.__class__): @@ -74,12 +60,12 @@ class Schema(Component): """ type_id: t.ClassVar[str] = 'schema' - fields: t.Mapping[str, DataType] + fields: t.Mapping[str, BaseDataType] - def __post_init__(self, db, artifacts): + def __post_init__(self, db): assert self.identifier is not None, 'Schema must have an identifier' assert self.fields is not None, 'Schema must have fields' - super().__post_init__(db, artifacts) + super().__post_init__(db) for k, v in self.fields.items(): if isinstance(v, (BaseDataType, FieldType)): @@ -92,6 +78,12 @@ def __post_init__(self, db, artifacts): self.fields[k] = v + def update(self, other: 'Schema'): + new_fields = self.fields.copy() + new_fields.update(other.fields) + return Schema(self.identifier, fields=new_fields) + + # TODO why do we need this? @cached_property def encoded_types(self): """List of fields of type DataType.""" @@ -124,6 +116,7 @@ def decode_data( """Decode data using the schema's encoders. :param data: Data to decode. + :param getters: Getters to decode. 
""" if self.trivial: return data @@ -136,17 +129,8 @@ def decode_data( value = data[k] if reference := parse_reference(value): - value = getters.run(reference.name, reference.path) - if reference.name == 'blob': - kwargs = {'blob': value} - elif reference.name == 'file': - kwargs = {'x': value} - else: - assert False, f'Unknown reference type {reference.name}' - encodable = field.encodable_cls(datatype=field, **kwargs) - if not field.encodable_cls.lazy: - encodable = encodable.unpack() - decoded[k] = encodable + saveable: Saveable = getters.run(reference.name, reference.path) + decoded[k] = saveable else: b = data[k] if ( @@ -174,32 +158,34 @@ def encode_data(self, out, builds, blobs, files, leaves_to_keep=()): if k not in out: continue + if isinstance(out[k], Saveable): + continue + if isinstance(out[k], leaves_to_keep): continue - # data, identifier = field.encode_data_with_identifier(out[k]) data = field.encode_data(out[k]) - identifier = get_hash(data) - if ( field.encodable == 'encodable' and self.db.databackend.bytes_encoding == 'base64' ): assert isinstance(data, bytes) data = _convert_bytes_to_base64(data) + out[k] = data - if field.encodable in {'artifact', 'lazy_artifact'}: - reference = field.encodable_cls.build_reference(identifier, data) - ref_obj = parse_reference(reference) + elif isinstance(data, Saveable): + ref_obj = parse_reference(data.reference) if ref_obj.name == 'blob': - blobs[identifier] = data + blobs[data.identifier] = data.bytes + elif ref_obj.name == 'file': - files[identifier] = data + files[data.identifier] = data.path else: assert False, f'Unknown reference type {ref_obj.name}' - out[k] = reference + + out[k] = data.reference else: out[k] = data diff --git a/superduper/components/table.py b/superduper/components/table.py index d5947aa8d..8605d32db 100644 --- a/superduper/components/table.py +++ b/superduper/components/table.py @@ -22,7 +22,7 @@ class Table(Component): :param data: Data to insert post creation """ - _artifacts: t.ClassVar[t.Tuple[str]] = (('data', pickle_serializer),) + _fields = {'data': pickle_serializer} type_id: t.ClassVar[str] = 'table' @@ -30,8 +30,8 @@ class Table(Component): primary_id: str = DEFAULT_PRIMARY_ID data: t.List[t.Dict] | 'Dataset' | 'RemoteData' | None = None - def __post_init__(self, db, artifacts): - super().__post_init__(db, artifacts) + def __post_init__(self, db): + super().__post_init__(db) fields = {} fields.update(self.schema.fields) diff --git a/superduper/components/template.py b/superduper/components/template.py index c5ccec94f..22dad5cc2 100644 --- a/superduper/components/template.py +++ b/superduper/components/template.py @@ -38,7 +38,7 @@ class _BaseTemplate(Component): files: t.Optional[t.List[str]] = None substitutions: dc.InitVar[t.Optional[t.Dict]] = None - def __post_init__(self, db, artifacts, substitutions): + def __post_init__(self, db, substitutions): if isinstance(self.template, Leaf): self.template = self.template.encode(defaults=True, metadata=False) self.template = SuperDuperFlatEncode(self.template) @@ -59,7 +59,7 @@ def __post_init__(self, db, artifacts, substitutions): ) if self.template_variables is None: self.template_variables = self.template.variables - super().__post_init__(db, artifacts) + super().__post_init__(db) @ensure_initialized def __call__(self, **kwargs): @@ -224,10 +224,10 @@ class QueryTemplate(_BaseTemplate): type_id: t.ClassVar[str] = 'query_template' - def __post_init__(self, db, artifacts, substitutions): + def __post_init__(self, db, substitutions): if 
isinstance(self.template, Leaf): self.template = self.template.dict(metadata=False, defaults=False).encode() - return super().__post_init__(db, artifacts, substitutions) + return super().__post_init__(db, substitutions) @property def form_template(self): diff --git a/superduper/components/training.py b/superduper/components/training.py index c1981c03c..56ab504c4 100644 --- a/superduper/components/training.py +++ b/superduper/components/training.py @@ -1,7 +1,7 @@ import typing as t from superduper.components.component import Component -from superduper.components.datatype import DataType, file_lazy +from superduper.components.datatype import file_lazy class Checkpoint(Component): @@ -13,9 +13,9 @@ class Checkpoint(Component): path: t.Optional[str] step: int - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, DataType]]] = (("path", file_lazy),) + _fields = {'path': file_lazy} type_id: t.ClassVar[str] = "checkpoint" - def __post_init__(self, db, artifacts): - super().__post_init__(db, artifacts) + def __post_init__(self, db): + super().__post_init__(db) self.version = int(self.step) diff --git a/superduper/components/vector_index.py b/superduper/components/vector_index.py index 358ba4379..43723a5d7 100644 --- a/superduper/components/vector_index.py +++ b/superduper/components/vector_index.py @@ -11,13 +11,10 @@ from superduper.base.document import Document from superduper.components.cdc import CDC from superduper.components.component import Component -from superduper.components.datatype import DataType from superduper.components.listener import Listener from superduper.components.model import Mapping, ModelInputType from superduper.components.schema import Schema from superduper.components.table import Table -from superduper.ext.utils import str_shape -from superduper.misc.annotations import component from superduper.misc.special_dicts import MongoStyleDict from superduper.vector_search.base import VectorIndexMeasureType, VectorItem @@ -118,9 +115,9 @@ class VectorIndex(CDC): metric_values: t.Optional[t.Dict] = dc.field(default_factory=dict) cdc_table: str = '' - def __post_init__(self, db, artifacts): + def __post_init__(self, db): self.cdc_table = self.cdc_table or self.indexing_listener.outputs - return super().__post_init__(db, artifacts) + return super().__post_init__(db) def refresh(self): if self.cdc_table.startswith(CFG.output_prefix): @@ -419,42 +416,42 @@ def __call__(self, bytes, info: t.Optional[t.Dict] = None): return np.frombuffer(bytes, dtype=self.dtype).tolist() -@component( - {'name': 'shape', 'type': 'int'}, - {'name': 'identifier', 'type': 'str'}, -) -def vector(shape, identifier: t.Optional[str] = None): - """Create an encoder for a vector (list of ints/ floats) of a given shape. - - :param shape: The shape of the vector - :param identifier: The identifier of the vector - """ - if isinstance(shape, int): - shape = (shape,) - - identifier = identifier or f'vector[{str_shape(shape)}]' - return DataType( - identifier=identifier, - shape=shape, - encoder=None, - decoder=None, - encodable='native', - ) - - -@component() -def sqlvector(shape, bytes_encoding: t.Optional[str] = None): - """Create an encoder for a vector (list of ints/ floats) of a given shape. 
- - This is used for compatibility with SQL databases, as the default vector - - :param shape: The shape of the vector - :param bytes_encoding: The encoding of the bytes - """ - return DataType( - identifier=f'sqlvector[{str_shape(shape)}]', - shape=shape, - encoder=EncodeArray(dtype='float64'), - decoder=DecodeArray(dtype='float64'), - bytes_encoding=bytes_encoding, - ) +# @component( +# {'name': 'shape', 'type': 'int'}, +# {'name': 'identifier', 'type': 'str'}, +# ) +# def vector(shape, identifier: t.Optional[str] = None): +# """Create an encoder for a vector (list of ints/ floats) of a given shape. + +# :param shape: The shape of the vector +# :param identifier: The identifier of the vector +# """ +# if isinstance(shape, int): +# shape = (shape,) + +# identifier = identifier or f'vector[{str_shape(shape)}]' +# return DataType( +# identifier=identifier, +# shape=shape, +# encoder=None, +# decoder=None, +# encodable='native', +# ) + + +# @component() +# def sqlvector(shape, bytes_encoding: t.Optional[str] = None): +# """Create an encoder for a vector (list of ints/ floats) of a given shape. + +# This is used for compatibility with SQL databases, as the default vector + +# :param shape: The shape of the vector +# :param bytes_encoding: The encoding of the bytes +# """ +# return DataType( +# identifier=f'sqlvector[{str_shape(shape)}]', +# shape=shape, +# encoder=EncodeArray(dtype='float64'), +# decoder=DecodeArray(dtype='float64'), +# bytes_encoding=bytes_encoding, +# ) diff --git a/superduper/ext/llm/model.py b/superduper/ext/llm/model.py index bcd814c3c..4e466c056 100644 --- a/superduper/ext/llm/model.py +++ b/superduper/ext/llm/model.py @@ -28,8 +28,8 @@ class BaseLLM(Model): max_batch_size: t.Optional[int] = 4 signature: str = 'singleton' - def __post_init__(self, db, artifacts, example): - super().__post_init__(db, artifacts, example) + def __post_init__(self, db, example): + super().__post_init__(db, example) self.takes_context = True self.identifier = self.identifier.replace("/", "-") diff --git a/superduper/ext/llm/prompter.py b/superduper/ext/llm/prompter.py index e785e8df9..08e665062 100644 --- a/superduper/ext/llm/prompter.py +++ b/superduper/ext/llm/prompter.py @@ -67,9 +67,9 @@ class RetrievalPrompt(QueryModel): prompt_introduction: str = PROMPT_INTRODUCTION join: str = "\n---\n" - def __post_init__(self, db, artifacts): + def __post_init__(self, db): assert 'prompt' in self.select.variables - return super().__post_init__(db, artifacts) + return super().__post_init__(db) @property def inputs(self): diff --git a/superduper/ext/numpy/__init__.py b/superduper/ext/numpy/__init__.py index ae00cf5b6..a73b58d16 100644 --- a/superduper/ext/numpy/__init__.py +++ b/superduper/ext/numpy/__init__.py @@ -1,7 +1,7 @@ import typing as t -from .encoder import Array, array +from .encoder import Array requirements: t.List = [] -__all__ = ['array', 'Array'] +__all__ = ['Array'] diff --git a/superduper/ext/numpy/encoder.py b/superduper/ext/numpy/encoder.py index 5548ecad3..be781567e 100644 --- a/superduper/ext/numpy/encoder.py +++ b/superduper/ext/numpy/encoder.py @@ -4,12 +4,10 @@ from superduper.components.datatype import ( BaseDataType, - DataType, DataTypeFactory, - Encodable, + _Encodable, ) from superduper.ext.utils import str_shape -from superduper.misc.annotations import component class EncodeArray: @@ -55,21 +53,27 @@ def __call__(self, bytes, info: t.Optional[t.Dict] = None): class Array(BaseDataType): """Encode/ decode a numpy array as bytes. 
- :param dtype: numpy native datatype + :param dtype: numpy native datatype. + :param shape: Shape of array. """ dtype: str = 'float64' + shape: int | t.Tuple[int] + identifier: str = '' - def __post_init__(self, db, artifacts): - self.encodable_cls = Encodable + def __post_init__(self, db): + self.encodable_cls = _Encodable self.encodable = 'encodable' - return super().__post_init__(db, artifacts) + if not self.identifier: + dtype = str(self.dtype) + self.identifier = f'numpy-{dtype}[{str_shape(self.shape)}]' + return super().__post_init__(db) - def encode_data(self, item, info=None): + def encode_data(self, item): encoder = EncodeArray(self.dtype) return encoder(item) - def decode_data(self, item, info=None): + def decode_data(self, item): shape = self.shape if isinstance(shape, int): shape = (self.shape,) @@ -77,29 +81,27 @@ def decode_data(self, item, info=None): return decoder(item) -@component() -def array( - dtype: str, - shape: t.Sequence, - bytes_encoding: t.Optional[str] = None, - encodable: str = 'encodable', -): - """ - Create an encoder of numpy arrays. - - :param dtype: The dtype of the array. - :param shape: The shape of the array. - :param bytes_encoding: The bytes encoding to use. - :param encodable: The encodable to use. - """ - return DataType( - identifier=f'numpy-{dtype}[{str_shape(shape)}]', - encoder=EncodeArray(dtype), - decoder=DecodeArray(dtype, shape), - shape=shape, - bytes_encoding=bytes_encoding, - encodable=encodable, - ) +# @component() +# def array( +# dtype: str, +# shape: t.Sequence, +# bytes_encoding: t.Optional[str] = None, +# encodable: str = 'encodable', +# ): +# """ +# Create an encoder of numpy arrays. + +# :param dtype: The dtype of the array. +# :param shape: The shape of the array. +# :param bytes_encoding: The bytes encoding to use. +# :param encodable: The encodable to use. +# """ +# return DataType( +# identifier=f'numpy-{dtype}[{str_shape(shape)}]', +# encoder=EncodeArray(dtype), +# decoder=DecodeArray(dtype, shape), +# encodable=encodable, +# ) class NumpyDataTypeFactory(DataTypeFactory): @@ -115,10 +117,10 @@ def check(data: t.Any) -> bool: return isinstance(data, numpy.ndarray) @staticmethod - def create(data: t.Any) -> DataType: + def create(data: t.Any) -> Array: """Create a numpy array datatype. It's used for registering the auto schema. :param data: The numpy array. """ - return array(dtype=str(data.dtype), shape=list(data.shape)) + return Array(dtype=str(data.dtype), shape=list(data.shape)) diff --git a/superduper/misc/annotations.py b/superduper/misc/annotations.py index 0a6cba321..87dffd508 100644 --- a/superduper/misc/annotations.py +++ b/superduper/misc/annotations.py @@ -121,6 +121,7 @@ def _get_indent(docstring: str) -> int: return len(non_empty_lines[1]) - len(non_empty_lines[1].lstrip()) +# TODO deprecate - no longer needed def importable(f): """Make a function serializable as an importable. 
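
Before the auto-schema changes below: the numpy hunk above retires the `@component`-decorated `array(...)` factory in favour of constructing `Array` directly, with the identifier derived in `__post_init__`. A short sketch under those assumptions (dtype and shape values are illustrative):

```python
import numpy as np

from superduper.ext.numpy.encoder import Array

dt = Array(dtype='float32', shape=(3,))
# __post_init__ derives the identifier, e.g. 'numpy-float32[3]'.

encoded = dt.encode_data(np.array([1.0, 2.0, 3.0], dtype='float32'))
decoded = dt.decode_data(encoded)  # numpy array restored from bytes

assert np.allclose(decoded, [1.0, 2.0, 3.0])
```
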
diff --git a/superduper/misc/auto_schema.py b/superduper/misc/auto_schema.py index 831a7b4e5..ccb01b96c 100644 --- a/superduper/misc/auto_schema.py +++ b/superduper/misc/auto_schema.py @@ -6,13 +6,11 @@ from superduper import CFG, logging from superduper.base.exceptions import UnsupportedDatatype from superduper.components.datatype import ( + DEFAULT_DATATYPE, BaseDataType, - DataType, DataTypeFactory, Vector, - _BaseEncodable, - get_serializer, - json_serializer, + json_encoder, ) from superduper.components.schema import FieldType, Schema @@ -29,12 +27,6 @@ def register_module(module_name): logging.debug(f"Could not register module: {module_name}") -DEFAULT_DATATYPE = get_serializer( - identifier='DEFAULT', - method='pickle', - encodable='encodable', -) - BASE_TYPES = ( int, str, @@ -45,7 +37,7 @@ def register_module(module_name): ) -def infer_datatype(data: t.Any) -> t.Optional[t.Union[DataType, type]]: +def infer_datatype(data: t.Any) -> t.Optional[t.Union[BaseDataType, type]]: """Infer the datatype of a given data object. If the data object is a base type, return None, @@ -55,8 +47,9 @@ def infer_datatype(data: t.Any) -> t.Optional[t.Union[DataType, type]]: """ datatype = None - if isinstance(data, _BaseEncodable): - return datatype + # # TODO - why this? + # if isinstance(data, _BaseEncodable): + # return datatype try: from bson import ObjectId @@ -80,8 +73,8 @@ def infer_datatype(data: t.Any) -> t.Optional[t.Union[DataType, type]]: if datatype is None: try: - encoded_data = DEFAULT_DATATYPE.encoder(data) - decoded_data = DEFAULT_DATATYPE.decoder(encoded_data) + encoded_data = DEFAULT_DATATYPE.encode_data(data) + decoded_data = DEFAULT_DATATYPE.decode_data(encoded_data) assert isinstance(decoded_data, type(data)) except Exception as e: raise UnsupportedDatatype( @@ -163,7 +156,7 @@ def check(data: t.Any) -> bool: :param data: The data object """ try: - json_serializer.encode_data(data) + json_encoder.encode_data(data) return True except Exception: return False @@ -176,7 +169,7 @@ def create(data: t.Any) -> BaseDataType | FieldType: """ if CFG.json_native: return FieldType(identifier='json') - return json_serializer + return json_encoder register_module("superduper.ext.numpy.encoder") diff --git a/superduper/misc/compat.py b/superduper/misc/compat.py index 516aebccc..05613d87b 100644 --- a/superduper/misc/compat.py +++ b/superduper/misc/compat.py @@ -1,4 +1,5 @@ """Functions from later standard libraries not available in Python 3.8.""" +# TODO not needed from functools import lru_cache diff --git a/superduper/misc/download.py b/superduper/misc/download.py index b81e770dc..d00875747 100644 --- a/superduper/misc/download.py +++ b/superduper/misc/download.py @@ -14,10 +14,8 @@ from tqdm import tqdm from superduper import CFG, logging -from superduper.backends.base.query import Query -from superduper.base.constant import KEY_BUILDS -from superduper.base.document import Document -from superduper.components.datatype import _BaseEncodable + +# from superduper.components.datatype import _BaseEncodable from superduper.components.model import Model @@ -255,277 +253,6 @@ def _sequential_go(self, f): f(i) -class Updater: - """Updater class to update the artifact. - - :param db: Datalayer instance - :param query: query to be executed - """ - - def __init__(self, db, query): - self.db = db - self.query = query - - def exists(self, uri, key, id, datatype): - """Check if the artifact exists. 
- - :param uri: uri to download from - :param key: key in the document - :param id: id of the document - :param datatype: datatype of the document - """ - if self.db.datatypes[datatype].encodable == 'artifact': - out = self.db.artifact_store.exists(uri=uri, datatype=datatype) - else: - table_or_collection = self.query.table_or_collection.identifier - out = self.db.databackend.exists(table_or_collection, id, key) - return out - - def __call__( - self, - *, - uri, - key, - id, - datatype, - bytes_, - ): - """Run the updater. - - :param uri: uri to download from - :param key: key in the document - :param id: id of the document - :param datatype: datatype of the document - :param bytes_: bytes to insert - """ - if self.db.datatypes[datatype].encodable == 'artifact': - self.db.artifact_store.save_artifact( - { - 'uri': uri, - 'datatype': datatype, - 'bytes': bytes_, - 'directory': self.db.datatypes[datatype].directory, - } - ) - else: - # TODO move back to databackend - self.query.download_update(db=self.db, key=key, id=id, bytes=bytes_) - - -class Downloader(BaseDownloader): - """ - Download files from a list of URIs. - - :param uris: list of uris/ file names to fetch - :param update_one: function to call to insert data into table - :param ids: list of ids of rows/ documents to update - :param keys: list of keys in rows/ documents to insert to - :param datatypes: list of datatypes of rows/ documents to insert to - :param n_workers: number of multiprocessing workers - :param headers: dictionary of request headers passed to``requests`` package - :param skip_existing: if ``True`` then don't bother getting already present data - :param timeout: set seconds until request times out - :param raises: raises error ``True``/``False`` - """ - - results: t.Dict[int, str] - - def __init__( - self, - uris, - update_one: t.Optional[t.Callable] = None, - ids: t.Optional[t.Union[t.List[str], t.List[int]]] = None, - keys: t.Optional[t.List[str]] = None, - datatypes: t.Optional[t.List[str]] = None, - n_workers: int = 20, - headers: t.Optional[t.Dict] = None, - skip_existing: bool = True, - timeout: t.Optional[int] = None, - raises: bool = True, - ): - super().__init__( - uris, n_workers=n_workers, timeout=timeout, headers=headers, raises=raises - ) - - if ids is not None: - if len(ids) != len(uris): - raise ValueError(f'len(ids={ids}) != len(uris={uris})') - - self.ids = ids - self.keys = keys - self.datatypes = datatypes - self.failed = 0 - self.skip_existing = skip_existing - self.update_one = update_one - - def _download(self, i): - if self.update_one.exists( - id=self.ids[i], - key=self.keys[i], - uri=self.uris[i], - datatype=self.datatypes[i], - ): - return - content = self.fetcher(self.uris[i]) - self.update_one( - id=self.ids[i], - key=self.keys[i], - datatype=self.datatypes[i], - bytes_=content, - uri=self.uris[i], - ) - - -def gather_uris( - documents: t.Sequence[Document], gather_ids: bool = True -) -> t.Tuple[t.List[str], t.List[str], t.List[t.Any], t.List[str]]: - """Get the uris out of all documents as denoted by ``{"_content": ...}``. 
- - :param documents: list of dictionaries - :param gather_ids: if ``True`` then gather ids of documents - """ - uris = [] - mongo_keys = [] - datatypes = [] - ids = [] - for i, r in enumerate(documents): - sub_uris, sub_mongo_keys, sub_datatypes = _gather_uris_for_document(r) - if gather_ids: - ids.extend([r['_id'] for _ in sub_uris]) - else: - ids.append(i) - uris.extend(sub_uris) - mongo_keys.extend(sub_mongo_keys) - datatypes.extend(sub_datatypes) - return uris, mongo_keys, datatypes, ids - - -def _gather_uris_for_document(r: Document, id_field: str = '_id'): - """Get the uris out of a single document as denoted by ``{"_content": ...}``. - - >>> _gather_uris_for_document({'a': {'_content': {'uri': 'test'}}}) - (['test'], ['a']) - >>> d = {'b': {'a': {'_content': {'uri': 'test'}}}} - >>> _gather_uris_for_document(d) - (['test'], ['b.a']) - >>> d = {'b': {'a': {'_content': {'uri': 'test', 'bytes': b'abc'}}}} - >>> _gather_uris_for_document(d) - ([], []) - """ - uris = [] - keys = [] - datatypes = [] - # TODO: This function not be tested in UT, - # fast fix the schema parameter to avoid type error - leaf_lookup = r.encode(None, leaves_to_keep=(_BaseEncodable,))[KEY_BUILDS] - for k in leaf_lookup: - if leaf_lookup[k].uri is None: - continue - keys.append(k) - uris.append(leaf_lookup[k].uri) - datatypes.append(leaf_lookup[k].datatype.identifier) - return uris, keys, datatypes - - -def download_content( - db, - query: t.Union[Query, t.Dict], - ids: t.Optional[t.Sequence[str]] = None, - documents: t.Optional[t.List[Document]] = None, - raises: bool = True, - n_workers: t.Optional[int] = None, -) -> t.Optional[t.Sequence[Document]]: - """Download content contained in uploaded data. - - Items to be downloaded are identifier - via the subdocuments in the form exemplified below. By default items are downloaded - to the database, unless a ``download_update`` function is provided. - - :param db: database instance - :param query: query to be executed - :param ids: ids to be downloaded - :param documents: documents to be downloaded - :param raises: whether to raise errors - :param n_workers: number of download workers - - >>> d = {"_content": {"uri": "", "encoder": ""}} - >>> def update(key, id, bytes): - >>> ... with open(f'/tmp/{key}+{id}', 'wb') as f: - >>> ... f.write(bytes) - >>> download_content(None, None, ids=["0"], documents=[d])) - ... 
- """ - logging.debug(str(query)) - logging.debug(str(ids)) - - # TODO handle this in the job runner - if isinstance(query, dict): - query = Document.decode(query).unpack() - query = t.cast(Query, query) - query.db = db - - if documents is not None: - pass - elif isinstance(query, Query) and query.type == 'select': - if ids is None: - # TODO deprecate reference since lazy loading in any case - documents = list(db.execute(query)) - else: - select = query.select_using_ids(ids) - documents = list(db.execute(select)) - else: - assert query.type == 'insert' - documents = t.cast(t.List[Document], query.documents) - - uris, keys, datatypes, place_ids = gather_uris(documents) - - if uris: - logging.info(f'found {len(uris)} uris') - - if not uris: - return # type: ignore[return-value] - - downloader = Downloader( - uris=uris, - ids=place_ids, - keys=keys, - datatypes=datatypes, - update_one=Updater(db, query), - n_workers=n_workers or CFG.downloads.n_workers, - timeout=CFG.downloads.timeout, - headers=CFG.downloads.headers, - raises=raises, - ) - downloader.go() - - return # type: ignore[return-value] - - -def download_from_one(r: Document): - """Download content from a single document. - - This function will find all URIs in the document and download them. - - :param r: document to download from - """ - uris, keys, _, _ = gather_uris([r]) - if not uris: - return - - downloader = BaseDownloader( - uris=uris, - n_workers=0, - timeout=CFG.downloads.timeout, - headers=CFG.downloads.headers, - raises=True, - ) - downloader.go() - for key, uri in zip(keys, uris): - r[key].x = r[key].datatype.decode_data(downloader.results[uri]) - - return - - class DownloadFiles(Model): """Download files from a list of URIs. diff --git a/superduper/misc/importing.py b/superduper/misc/importing.py new file mode 100644 index 000000000..052067ec1 --- /dev/null +++ b/superduper/misc/importing.py @@ -0,0 +1,11 @@ +import importlib + + +def import_object(path): + """Import item from path. + + :param path: Path to import from. + """ + module = '.'.join(path.split('.')[:-1]) + cls = path.split('.')[-1] + return getattr(importlib.import_module(module), cls) diff --git a/superduper/rest/utils.py b/superduper/rest/utils.py index ae25e9fb6..b72cdc6b9 100644 --- a/superduper/rest/utils.py +++ b/superduper/rest/utils.py @@ -1,19 +1,19 @@ import inspect from superduper import Document -from superduper.components.datatype import Artifact, Encodable +from superduper.components.datatype import _Artifact, _Encodable def rewrite_artifacts(r, db): """Helper function to rewrite artifacts.""" - if isinstance(r, Encodable): + if isinstance(r, _Encodable): kwargs = r.dict() kwargs['datatype'].encodable = 'artifact' blob = r._encode()[0] db.artifact_store.put_bytes(blob, file_id=r.identifier) - init_args = inspect.signature(Artifact.__init__).parameters.keys() + init_args = inspect.signature(_Artifact.__init__).parameters.keys() kwargs = {k: v for k, v in kwargs.items() if k in init_args} - return Artifact(**kwargs) + return _Artifact(**kwargs) if isinstance(r, Document): return Document(rewrite_artifacts(dict(r), db=db)) if isinstance(r, dict): diff --git a/superduper/vector_search/base.py b/superduper/vector_search/base.py index d8a6d4ca3..bb9c9d65e 100644 --- a/superduper/vector_search/base.py +++ b/superduper/vector_search/base.py @@ -12,6 +12,8 @@ if t.TYPE_CHECKING: from superduper.components.vector_index import VectorIndex +# TODO this is now in the wrong place + class BaseVectorSearcher(ABC): """Base class for vector searchers. 
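
The new `superduper/misc/importing.py` module above centralizes the dotted-path resolution that `Vector.datatype_impl` performs inline earlier in this diff. A quick usage sketch; the path points at the `Array` datatype from the numpy encoder touched above:

```python
from superduper.misc.importing import import_object

# Split 'pkg.module.Attr' into module and attribute, import, resolve.
Array = import_object('superduper.ext.numpy.encoder.Array')

assert Array.__name__ == 'Array'
```
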
diff --git a/templates/simple_rag/VERSION b/templates/simple_rag/VERSION index 1d0ba9ea1..8f0916f76 100644 --- a/templates/simple_rag/VERSION +++ b/templates/simple_rag/VERSION @@ -1 +1 @@ -0.4.0 +0.5.0 diff --git a/templates/simple_rag/build.ipynb b/templates/simple_rag/build.ipynb index 88289701c..17947c9f9 100644 --- a/templates/simple_rag/build.ipynb +++ b/templates/simple_rag/build.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "3ef70f6d-a189-460a-8864-241a689624e2", "metadata": { "editable": true, @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "cb029a5e-fedf-4f07-8a31-d220cfbfbb3d", "metadata": { "editable": true, @@ -74,7 +74,22 @@ }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:39.55\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.misc.plugins\u001b[0m:\u001b[36m13 \u001b[0m | \u001b[1mLoading plugin: mongodb\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:39.58\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1mBuilding Data Layer\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:39.58\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.build\u001b[0m:\u001b[36m184 \u001b[0m | \u001b[1mConfiguration: \n", + " +---------------+--------------+\n", + "| Configuration | Value |\n", + "+---------------+--------------+\n", + "| Data Backend | mongomock:// |\n", + "+---------------+--------------+\u001b[0m\n" + ] + } + ], "source": [ "from superduper import superduper, CFG\n", "\n", @@ -86,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "4e7902bd", "metadata": { "editable": true, @@ -112,10 +127,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "1ef8dd07-1b47-4dce-84dd-a081d1f5ee9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:41.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36m__main__\u001b[0m:\u001b[36m7 \u001b[0m | \u001b[1mDownloading data...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:42.44\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36m__main__\u001b[0m:\u001b[36m9 \u001b[0m | \u001b[1mDownloading data... 
(Done)\u001b[0m\n" + ] + } + ], "source": [ "if APPLY:\n", " data = getter()" @@ -134,10 +158,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "c5965fdf", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (table, docs) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('table', 'docs')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m329 \u001b[0m | \u001b[1mTable docs does not exist, auto creating...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m335 \u001b[0m | \u001b[1mCreating table docs with schema {('_fold', 'str'), ('x', 'str')}\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (schema, AUTO-_fold=&x=) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('schema', \"AUTO-_fold=&x=\")) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new schema:AUTO-_fold=&x=:0cc2139173e6460a\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (table, docs) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('table', 'docs')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.05\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new table:docs:059cbc59f1794f86\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| 
\u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/dbc1aaddc8b7343d6d33b34edcf608b8f8801918 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m67 \u001b[0m | \u001b[1mFound these changes and/ or additions that need to be made:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m70 \u001b[0m | \u001b[1mMETADATA EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m71 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[0]: schema:AUTO-_fold=&x=:0cc2139173e6460a: create ~ [1]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m82 \u001b[0m | \u001b[1m[1]: table:docs:059cbc59f1794f86: create\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m84 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m85 \u001b[0m | \u001b[1mJOBS EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m86 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m90 \u001b[0m | \u001b[1mNo job events...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m101 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 0cc2139173e6460a not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| 
\u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding schema:AUTO-_fold=&x=:0cc2139173e6460a to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding schema: AUTO-_fold=&x= to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 059cbc59f1794f86 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding table:docs:059cbc59f1794f86 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.06\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.compute\u001b[0m:\u001b[36m49 \u001b[0m | \u001b[33m\u001b[1mCould not release futures for context 059cbc59f1794f86\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:44.09\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m308 \u001b[0m | \u001b[1mInserted 187 documents into docs\u001b[0m\n" + ] + } + ], "source": [ "if APPLY:\n", " from superduper import Document\n", @@ -168,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "2d20eaa0-a416-4483-938e-23f79845739a", "metadata": {}, "outputs": [], @@ -196,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "93d21872-d4dc-40dc-abab-fb07ba102ea3", "metadata": {}, "outputs": [], @@ -215,10 +277,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "31900eec-b516-4bef-939e-2e8f46252b12", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:48.68\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (model, chunker) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('model', 'chunker')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (listener, chunker) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | 
\u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('listener', 'chunker')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (schema, _schema/_outputs__chunker__32a68622e6ac4e8c) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('schema', '_schema/_outputs__chunker__32a68622e6ac4e8c')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new schema:_schema/_outputs__chunker__32a68622e6ac4e8c:b65ad745363446e6\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (table, _outputs__chunker__32a68622e6ac4e8c) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('table', '_outputs__chunker__32a68622e6ac4e8c')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new table:_outputs__chunker__32a68622e6ac4e8c:09d1628b5c0c4870\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/dbc1aaddc8b7343d6d33b34edcf608b8f8801918 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new listener:chunker:32a68622e6ac4e8c\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m 
| \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m67 \u001b[0m | \u001b[1mFound these changes and/ or additions that need to be made:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m70 \u001b[0m | \u001b[1mMETADATA EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m71 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[0]: model:chunker:6c65cfb0dc6a4240: create ~ [3]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[1]: schema:_schema/_outputs__chunker__32a68622e6ac4e8c:b65ad745363446e6: create ~ [2]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[2]: table:_outputs__chunker__32a68622e6ac4e8c:09d1628b5c0c4870: create ~ [3]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m82 \u001b[0m | \u001b[1m[3]: listener:chunker:32a68622e6ac4e8c: create\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m84 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m85 \u001b[0m | \u001b[1mJOBS EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m86 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m99 \u001b[0m | \u001b[1m[0]: listener:chunker:32a68622e6ac4e8c.run: run\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m94 \u001b[0m | \u001b[1m[1]: listener:chunker:32a68622e6ac4e8c.set_status: set_status ~ [0]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| 
\u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m101 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 6c65cfb0dc6a4240 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:chunker:6c65cfb0dc6a4240 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding model: chunker to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent b65ad745363446e6 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding schema:_schema/_outputs__chunker__32a68622e6ac4e8c:b65ad745363446e6 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding schema: _schema/_outputs__chunker__32a68622e6ac4e8c to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 09d1628b5c0c4870 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding table:_outputs__chunker__32a68622e6ac4e8c:09d1628b5c0c4870 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 32a68622e6ac4e8c not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding listener:chunker:32a68622e6ac4e8c to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding listener: chunker to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.71\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.model\u001b[0m:\u001b[36m531 \u001b[0m | \u001b[1mRequesting prediction in db - [chunker] with predict_id chunker__32a68622e6ac4e8c\n", + "\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"187it [00:00, 5526.48it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:48.77\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.model\u001b[0m:\u001b[36m664 \u001b[0m | \u001b[1mAdding 187 model outputs to `db`\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:48.84\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m308 \u001b[0m | \u001b[1mInserted 336 documents into _outputs__chunker__32a68622e6ac4e8c\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], "source": [ "if APPLY and EAGER:\n", " db.apply(upstream_listener, force=True)" @@ -263,13 +399,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a9b1f538-65ca-499e-b6d0-2dd733f81723", "metadata": {}, "outputs": [], "source": [ "import os\n", - "from superduper.components.vector_index import sqlvector\n", "\n", "from superduper_openai import OpenAIEmbedding\n", "\n", @@ -289,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "4663fa4b-c2ec-427d-bf8b-b8b109cc2ccf", "metadata": {}, "outputs": [], @@ -312,10 +447,138 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "509c3505-54c5-4e68-84ec-3df8bea0fd74", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:54.93\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:54.94\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.62\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.63\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.65\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m243 \u001b[0m | \u001b[1mFound update model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/4b100f6e48727d74cb84f5c7d979d988aee6aa51 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| 
\u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical listener:chunker:32a68622e6ac4e8c\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (datatype, vector[1536]) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('datatype', 'vector[1536]')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new datatype:vector[1536]:7da0ede750ef4110\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (model, text-embedding) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('model', 'text-embedding')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new model:text-embedding:169d3962c9964326\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (listener, embeddinglistener) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('listener', 'embeddinglistener')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (datatype, vector[1536]) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('datatype', 'vector[1536]')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new datatype:vector[1536]:7da0ede750ef4110\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (schema, 
_schema/_outputs__embeddinglistener__6e0274765d264d25) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('schema', '_schema/_outputs__embeddinglistener__6e0274765d264d25')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new schema:_schema/_outputs__embeddinglistener__6e0274765d264d25:2013c1d9203b4c92\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (table, _outputs__embeddinglistener__6e0274765d264d25) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.66\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('table', '_outputs__embeddinglistener__6e0274765d264d25')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new table:_outputs__embeddinglistener__6e0274765d264d25:949b42ee0b9e4afc\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/dbc1aaddc8b7343d6d33b34edcf608b8f8801918 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new listener:embeddinglistener:6e0274765d264d25\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (vector_index, vectorindex) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('vector_index', 'vectorindex')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new vector_index:vectorindex:fb8364f87f6446c0\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | 
\u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m64 \u001b[0m | \u001b[1mFound this diff:\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "
vectorindex\n",
+       "└── chunker\n",
+       "    ├── status: update\n",
+       "    ├── changes\n",
+       "    │   └── _object: Blob(identifier='4b100f6e48727d74cb84f5c7d979d988aee6aa51', uuid='22d9da58406e42e1', bytes=b'\\\n",
+       "    └── type_id: model\n",
+       "
\n" + ], + "text/plain": [ + "vectorindex\n", + "└── \u001b[1;33mchunker\u001b[0m\n", + " ├── \u001b[1;36mstatus: \u001b[0m\u001b[1;34mupdate\u001b[0m\n", + " ├── \u001b[1;33mchanges\u001b[0m\n", + " │ └── \u001b[1;36m_object: \u001b[0m\u001b[1;32mBlob(identifier='4b100f6e48727d74cb84f5c7d979d988aee6aa51', uuid='22d9da58406e42e1', bytes=b'\\\u001b[0m\n", + " └── \u001b[1;36mtype_id: \u001b[0m\u001b[1;32mmodel\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m67 \u001b[0m | \u001b[1mFound these changes and/ or additions that need to be made:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m70 \u001b[0m | \u001b[1mMETADATA EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m71 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m80 \u001b[0m | \u001b[1m[0]: model:chunker:6c65cfb0dc6a4240: update\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[1]: datatype:vector[1536]:7da0ede750ef4110: create ~ [3]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[2]: model:text-embedding:169d3962c9964326: create ~ [5]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[3]: schema:_schema/_outputs__embeddinglistener__6e0274765d264d25:2013c1d9203b4c92: create ~ [4]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[4]: table:_outputs__embeddinglistener__6e0274765d264d25:949b42ee0b9e4afc: create ~ [5]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[5]: listener:embeddinglistener:6e0274765d264d25: create ~ [6]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m82 \u001b[0m | \u001b[1m[6]: vector_index:vectorindex:fb8364f87f6446c0: create\u001b[0m\n", + "\u001b[32m2024-Nov-25 
14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m84 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m85 \u001b[0m | \u001b[1mJOBS EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m86 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m99 \u001b[0m | \u001b[1m[0]: listener:embeddinglistener:6e0274765d264d25.run: run\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m94 \u001b[0m | \u001b[1m[1]: listener:embeddinglistener:6e0274765d264d25.set_status: set_status ~ [0]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m94 \u001b[0m | \u001b[1m[2]: vector_index:vectorindex:fb8364f87f6446c0.copy_vectors: copy_vectors ~ [0,1]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m94 \u001b[0m | \u001b[1m[3]: vector_index:vectorindex:fb8364f87f6446c0.set_status: set_status ~ [2]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.67\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m101 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 7da0ede750ef4110 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding datatype:vector[1536]:7da0ede750ef4110 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding datatype: vector[1536] to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 169d3962c9964326 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:text-embedding:169d3962c9964326 to cache\u001b[0m\n", + 
"\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding model: text-embedding to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 2013c1d9203b4c92 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding schema:_schema/_outputs__embeddinglistener__6e0274765d264d25:2013c1d9203b4c92 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding schema: _schema/_outputs__embeddinglistener__6e0274765d264d25 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.68\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 949b42ee0b9e4afc not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding table:_outputs__embeddinglistener__6e0274765d264d25:949b42ee0b9e4afc to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 6e0274765d264d25 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 32a68622e6ac4e8c not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 6c65cfb0dc6a4240 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:chunker:6c65cfb0dc6a4240 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding listener:chunker:32a68622e6ac4e8c to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding listener:embeddinglistener:6e0274765d264d25 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding listener: 
embeddinglistener to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent fb8364f87f6446c0 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding vector_index:vectorindex:fb8364f87f6446c0 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.69\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding vector_index: vectorindex to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:42:55.70\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.model\u001b[0m:\u001b[36m531 \u001b[0m | \u001b[1mRequesting prediction in db - [text-embedding] with predict_id embeddinglistener__6e0274765d264d25\n", + "\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "336it [00:00, 28114.31it/s]\n", + "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00, 1.64s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:02.31\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.model\u001b[0m:\u001b[36m664 \u001b[0m | \u001b[1mAdding 336 model outputs to `db`\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:03.90\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m308 \u001b[0m | \u001b[1mInserted 336 documents into _outputs__embeddinglistener__6e0274765d264d25\u001b[0m\n" + ] + } + ], "source": [ "if APPLY and EAGER:\n", " db.apply(vector_index, force=True)" @@ -332,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "f98e5ff4", "metadata": {}, "outputs": [], @@ -356,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "44baeb09-6f35-4cf2-b814-46283a59f7e9", "metadata": {}, "outputs": [], @@ -383,10 +646,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "2d3a0d3a-da1c-41ec-b16c-f281c46ad794", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (model, llm-model) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('model', 'llm-model')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new model:llm-model:f5a1f4c2908b4570\u001b[0m\n", 
+ "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (model, simple_rag) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('model', 'simple_rag')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new model:simple_rag:721ab56bc3784088\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m67 \u001b[0m | \u001b[1mFound these changes and/ or additions that need to be made:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m70 \u001b[0m | \u001b[1mMETADATA EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m71 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m78 \u001b[0m | \u001b[1m[0]: model:llm-model:f5a1f4c2908b4570: create ~ [1]\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m82 \u001b[0m | \u001b[1m[1]: model:simple_rag:721ab56bc3784088: create\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m84 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m85 \u001b[0m | \u001b[1mJOBS EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m86 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m90 \u001b[0m | \u001b[1mNo job events...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.40\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| 
\u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m101 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.41\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent f5a1f4c2908b4570 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.42\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:llm-model:f5a1f4c2908b4570 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.42\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding model: llm-model to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.42\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 721ab56bc3784088 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.43\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:simple_rag:721ab56bc3784088 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.43\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding model: simple_rag to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:13.43\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.compute\u001b[0m:\u001b[36m49 \u001b[0m | \u001b[33m\u001b[1mCould not release futures for context 721ab56bc3784088\u001b[0m\n" + ] + } + ], "source": [ "if APPLY and EAGER:\n", " db.apply(rag, force=True)" @@ -402,10 +696,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "e6787c78-4b14-4a72-818b-450408a74331", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:17.42\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.application\u001b[0m:\u001b[36m39 \u001b[0m | \u001b[1mResorting components based on topological order.\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:17.42\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.application\u001b[0m:\u001b[36m56 \u001b[0m | \u001b[1mNew order of components: ['listener:chunker:32a68622e6ac4e8c', 'vector_index:vectorindex:fb8364f87f6446c0', 'model:simple_rag:721ab56bc3784088']\u001b[0m\n" + ] + } + ], "source": [ "from superduper import Application\n", "\n", @@ -421,10 +724,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "e7c16557-af76-4e70-83d9-2984e19a9554", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:18.39\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | 
\u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.39\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.40\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.40\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf str already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.40\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.40\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.45\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m243 \u001b[0m | \u001b[1mFound update model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/4b100f6e48727d74cb84f5c7d979d988aee6aa51 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical listener:chunker:32a68622e6ac4e8c\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.46\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO 
\u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m243 \u001b[0m | \u001b[1mFound update model:chunker:6c65cfb0dc6a4240\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.artifacts\u001b[0m:\u001b[36m87 \u001b[0m | \u001b[33m\u001b[1mFile /tmp/4b100f6e48727d74cb84f5c7d979d988aee6aa51 already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical listener:chunker:32a68622e6ac4e8c\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical datatype:vector[1536]:7da0ede750ef4110\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m243 \u001b[0m | \u001b[1mFound update datatype:vector[1536]:7da0ede750ef4110\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical model:text-embedding:169d3962c9964326\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.listener\u001b[0m:\u001b[36m74 \u001b[0m | \u001b[33m\u001b[1moutput_table not found in listener.dict()\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical listener:embeddinglistener:6e0274765d264d25\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.document\u001b[0m:\u001b[36m576 \u001b[0m | \u001b[33m\u001b[1mLeaf ID already exists\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m243 \u001b[0m | \u001b[1mFound update listener:embeddinglistener:6e0274765d264d25\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical vector_index:vectorindex:fb8364f87f6446c0\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | \u001b[1mFound identical model:llm-model:f5a1f4c2908b4570\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m188 \u001b[0m | 
\u001b[1mFound identical model:simple_rag:721ab56bc3784088\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m616 \u001b[0m | \u001b[1mComponent (application, simple-rag-app) not found in cache, loading from db\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m622 \u001b[0m | \u001b[1mLoad (('application', 'simple-rag-app')) from metadata...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m257 \u001b[0m | \u001b[1mFound new application:simple-rag-app:43582331de8b49c8\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m64 \u001b[0m | \u001b[1mFound this diff:\u001b[0m\n" + ] + }, + { + "data": { + "text/html": [ + "
simple-rag-app\n",
+       "├── chunker\n",
+       "│   ├── status: update\n",
+       "│   ├── changes\n",
+       "│   │   └── _object: Blob(identifier='4b100f6e48727d74cb84f5c7d979d988aee6aa51', uuid='7531030f3a244229', bytes=b'\\\n",
+       "│   └── type_id: model\n",
+       "├── vector[1536]\n",
+       "│   ├── status: update\n",
+       "│   ├── changes\n",
+       "│   │   └── shape: (1536,)\n",
+       "│   └── type_id: datatype\n",
+       "└── embeddinglistener\n",
+       "    ├── status: update\n",
+       "    ├── changes\n",
+       "    │   └── upstream: ['?chunker', '?chunker']\n",
+       "    └── type_id: listener\n",
+       "
\n" + ], + "text/plain": [ + "simple-rag-app\n", + "├── \u001b[1;33mchunker\u001b[0m\n", + "│ ├── \u001b[1;36mstatus: \u001b[0m\u001b[1;34mupdate\u001b[0m\n", + "│ ├── \u001b[1;33mchanges\u001b[0m\n", + "│ │ └── \u001b[1;36m_object: \u001b[0m\u001b[1;32mBlob(identifier='4b100f6e48727d74cb84f5c7d979d988aee6aa51', uuid='7531030f3a244229', bytes=b'\\\u001b[0m\n", + "│ └── \u001b[1;36mtype_id: \u001b[0m\u001b[1;32mmodel\u001b[0m\n", + "├── \u001b[1;33mvector[1536]\u001b[0m\n", + "│ ├── \u001b[1;36mstatus: \u001b[0m\u001b[1;34mupdate\u001b[0m\n", + "│ ├── \u001b[1;33mchanges\u001b[0m\n", + "│ │ └── \u001b[1;36mshape: \u001b[0m\u001b[1;32m(1536,)\u001b[0m\n", + "│ └── \u001b[1;36mtype_id: \u001b[0m\u001b[1;32mdatatype\u001b[0m\n", + "└── \u001b[1;33membeddinglistener\u001b[0m\n", + " ├── \u001b[1;36mstatus: \u001b[0m\u001b[1;34mupdate\u001b[0m\n", + " ├── \u001b[1;33mchanges\u001b[0m\n", + " │ └── \u001b[1;36mupstream: \u001b[0m\u001b[1;32m['?chunker', '?chunker']\u001b[0m\n", + " └── \u001b[1;36mtype_id: \u001b[0m\u001b[1;32mlistener\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m67 \u001b[0m | \u001b[1mFound these changes and/ or additions that need to be made:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m69 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m70 \u001b[0m | \u001b[1mMETADATA EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m71 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m80 \u001b[0m | \u001b[1m[0]: model:chunker:6c65cfb0dc6a4240: update\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m80 \u001b[0m | \u001b[1m[1]: datatype:vector[1536]:7da0ede750ef4110: update\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m80 \u001b[0m | \u001b[1m[2]: listener:embeddinglistener:6e0274765d264d25: update\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m82 \u001b[0m | \u001b[1m[3]: application:simple-rag-app:43582331de8b49c8: create\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m84 \u001b[0m | 
\u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m85 \u001b[0m | \u001b[1mJOBS EVENTS:\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m86 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m90 \u001b[0m | \u001b[1mNo job events...\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.47\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.apply\u001b[0m:\u001b[36m101 \u001b[0m | \u001b[1m----------------------------------------------------------------------------------------------------\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 43582331de8b49c8 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 32a68622e6ac4e8c not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 6c65cfb0dc6a4240 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding model:chunker:6c65cfb0dc6a4240 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding listener:chunker:32a68622e6ac4e8c to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent fb8364f87f6446c0 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m588 \u001b[0m | \u001b[1mComponent 6e0274765d264d25 not found in cache, loading from db with uuid\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding listener:embeddinglistener:6e0274765d264d25 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | 
\u001b[1mAdding vector_index:vectorindex:fb8364f87f6446c0 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.application\u001b[0m:\u001b[36m39 \u001b[0m | \u001b[1mResorting components based on topological order.\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.application\u001b[0m:\u001b[36m56 \u001b[0m | \u001b[1mNew order of components: ['listener:chunker:32a68622e6ac4e8c', 'vector_index:vectorindex:fb8364f87f6446c0', 'model:simple_rag:721ab56bc3784088']\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m607 \u001b[0m | \u001b[1mAdding application:simple-rag-app:43582331de8b49c8 to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.components.component\u001b[0m:\u001b[36m585 \u001b[0m | \u001b[1mAdding application: simple-rag-app to cache\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:18.48\u001b[0m| \u001b[33m\u001b[1mWARNING \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.backends.local.compute\u001b[0m:\u001b[36m49 \u001b[0m | \u001b[33m\u001b[1mCould not release futures for context 43582331de8b49c8\u001b[0m\n" + ] + } + ], "source": [ "if APPLY:\n", " db.apply(app, force=True)" @@ -432,13 +850,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "2a82ea22-9694-4c65-b72f-c89ae49d1ab2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-Nov-25 14:43:20.87\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m792 \u001b[0m | \u001b[1mGetting vector-index\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:20.87\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m800 \u001b[0m | \u001b[1m{}\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:21.39\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m792 \u001b[0m | \u001b[1mGetting vector-index\u001b[0m\n", + "\u001b[32m2024-Nov-25 14:43:21.39\u001b[0m| \u001b[1mINFO \u001b[0m | \u001b[36mDuncans-MBP.fritz.box\u001b[0m| \u001b[36msuperduper.base.datalayer\u001b[0m:\u001b[36m800 \u001b[0m | \u001b[1m{}\u001b[0m\n", + "Superduper project involves inserting data, tracking components, triggering work based on changes, and using AI models to interact with new data.\n" + ] + } + ], "source": [ "if APPLY:\n", " print(rag.predict('Tell me about the project'))" ] }, { diff --git a/test/integration/usecase/test_training.py b/test/integration/usecase/test_training.py index 5d2e6bbb9..e1d25019c 100644 --- a/test/integration/usecase/test_training.py +++ b/test/integration/usecase/test_training.py @@ -17,7 +17,7 @@ def fit(self, model, db, train_dataset, valid_dataset): class MyModel(Model): - _artifacts: t.ClassVar[t.Any] = (('estimator', pickle_serializer),) + _fields = {'estimator': pickle_serializer} estimator: t.Any signature: str = 'singleton'
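
The `test_training.py` hunk above is the pattern repeated throughout the test suite below: the `_artifacts` class variable (a tuple of `(field, serializer)` pairs) gives way to a `_fields` dict mapping field names to serializers, or to string aliases such as `'file'` and `'dill_serializer'`. A minimal sketch of the new declaration style, assuming the serializers exported by `superduper.components.datatype` (the class and field names here are illustrative only):

```python
import typing as t

from superduper.components.datatype import pickle_serializer
from superduper.components.model import Model


class MyModel(Model):
    # old style, removed in this diff:
    #   _artifacts: t.ClassVar[t.Any] = (('estimator', pickle_serializer),)
    _fields = {'estimator': pickle_serializer}

    estimator: t.Any
    signature: str = 'singleton'
```

diff --git 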
a/test/unittest/backends/local/test_artifacts.py b/test/unittest/backends/local/test_artifacts.py index c4989bd9b..93291cba8 100644 --- a/test/unittest/backends/local/test_artifacts.py +++ b/test/unittest/backends/local/test_artifacts.py @@ -7,11 +7,7 @@ from superduper.backends.local.artifacts import FileSystemArtifactStore from superduper.components.component import Component -from superduper.components.datatype import ( - DataType, - file_lazy, - serializers, -) +from superduper.components.datatype import INBUILT_DATATYPES @dc.dataclass(kw_only=True) @@ -19,9 +15,7 @@ class TestComponent(Component): path: str type_id: t.ClassVar[str] = "TestComponent" - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, "DataType"]]] = ( - ("path", file_lazy), - ) + _fields = {'path': 'file'} @dc.dataclass(kw_only=True) @@ -29,9 +23,7 @@ class TestComponentBytes(Component): function: callable type_id: t.ClassVar[str] = "TestComponent" - _artifacts: t.ClassVar[t.Sequence[t.Tuple[str, "DataType"]]] = ( - ("path", file_lazy), - ) + _fields = {'function': 'dill_serializer'} @pytest.fixture @@ -59,7 +51,7 @@ def random_directory(tmpdir): def artifact_store(tmpdir) -> FileSystemArtifactStore: tmpdir_path = os.path.join(tmpdir, "artifact_store") artifact_strore = FileSystemArtifactStore(f"{tmpdir_path}") - artifact_strore._serializers = serializers + artifact_strore._serializers = INBUILT_DATATYPES return artifact_strore diff --git a/test/unittest/base/test_datalayer.py b/test/unittest/base/test_datalayer.py index 6b0c96b1b..01d2f7d87 100644 --- a/test/unittest/base/test_datalayer.py +++ b/test/unittest/base/test_datalayer.py @@ -12,15 +12,13 @@ from superduper.components.component import Component from superduper.components.dataset import Dataset from superduper.components.datatype import ( - DataType, - LazyArtifact, + BaseDataType, + Blob, dill_serializer, - pickle_decode, - pickle_encode, pickle_serializer, ) from superduper.components.listener import Listener -from superduper.components.model import Model, ObjectModel, Trainer +from superduper.components.model import ImportedModel, Model, ObjectModel, Trainer from superduper.components.schema import FieldType, Schema from superduper.components.table import Table @@ -39,7 +37,7 @@ class FakeModel(Model): class TestComponent(Component): breaks: ClassVar[Sequence] = ('inc',) - _artifacts: ClassVar[Sequence[str]] = (('artifact', dill_serializer),) + _fields = {'artifact': dill_serializer} inc: int = 0 type_id: str = 'test-component' is_on_create: bool = False @@ -83,7 +81,6 @@ def add_fake_model(db: Datalayer): identifier='fake_model', example=((1,), {}), ) - db.apply(model) select = db['documents'].select() listener = Listener( identifier='listener-x', @@ -134,10 +131,15 @@ def test_add_version(db: Datalayer): assert db.show('test-component', 'test') == [0, 1, 2] +class TestComponentPickle(TestComponent): + _fields = {'artifact': pickle_serializer} + + def test_add_component_with_bad_artifact(db): artifact = {'data': lambda x: x} - component = TestComponent( - identifier='test', artifact=artifact, artifacts={'artifact': pickle_serializer} + component = TestComponentPickle( + identifier='test', + artifact=artifact, ) with pytest.raises(Exception): db.apply(component) @@ -149,9 +151,7 @@ def test_add_artifact_auto_replace(db): component = TestComponent(identifier='test', artifact=artifact) db.apply(component) r = db.show('test-component', 'test', -1) - assert r['artifact'].startswith('?') - info = r['_builds'][r['artifact'][1:]] - assert 
info['blob'].startswith('&') + assert r['artifact'].startswith('&') def test_add_child(db): @@ -206,7 +206,7 @@ def test_add_with_artifact(db): assert m.object is not None - assert isinstance(m.object, LazyArtifact) + assert isinstance(m.object, Blob) m.init() assert callable(m.object) @@ -289,7 +289,7 @@ def test_remove_component_with_artifact(db): info_with_artifact = db.metadata.get_component( 'test-component', 'test_with_artifact', 0 ) - artifact_file_id = info_with_artifact['artifact'][1:] + artifact_file_id = info_with_artifact['artifact'].split(':')[-1] with patch.object(db.artifact_store, '_delete_bytes') as mock_delete: db._remove_component_version( 'test-component', 'test_with_artifact', 0, force=True @@ -367,14 +367,22 @@ def test_show(db): assert db.show('test-component', 'b', -1)['version'] == 2 +class DataType(BaseDataType): + def encode_data(self, item): + return item + + def decode_data(self, item): + return item + + def test_load(db): - m1 = ObjectModel(object=lambda x: x, identifier='m1', datatype='int32') + m1 = ObjectModel(object=lambda x: x, identifier='m1') components = [ DataType(identifier='e1'), DataType(identifier='e2'), m1, - ObjectModel(object=lambda x: x, identifier='m1', datatype='int32'), + ObjectModel(object=lambda x: x, identifier='m1'), ] for component in components: db.apply(component) @@ -389,7 +397,7 @@ def test_load(db): db.load('model', 'e1') datatype = db.load('datatype', 'e1') - assert isinstance(datatype, DataType) + assert isinstance(datatype, BaseDataType) assert datatype.type_id, datatype.identifier in db.cluster.cache @@ -411,17 +419,7 @@ def test_insert(db): def test_insert_artifacts(db): - dt = DataType( - 'my_saveable', - encodable='artifact', - encoder=pickle_encode, - decoder=pickle_decode, - ) - table = Table( - 'documents', - schema=Schema('documents', fields={'x': dt}), - ) - db.apply(table) + db.cfg.auto_schema = True db._insert( db['documents'].insert( [Document({'x': numpy.random.randn(100)}) for _ in range(1)] @@ -549,7 +547,7 @@ def my_lambda(x): def test_compound_component(db): - m = ObjectModel( + m = ImportedModel( object=imported_value(my_lambda), identifier='my-test-module', datatype=FieldType(identifier='int'), @@ -613,7 +611,7 @@ def test_dataset(db): assert len(dataset.data) == len(list(db.execute(dataset.select))) -def test_delete_componet_with_same_artifact(db): +def test_delete_component_with_same_artifact(db): from superduper import ObjectModel model1 = ObjectModel( diff --git a/test/unittest/base/test_document.py b/test/unittest/base/test_document.py index 62eca6b8b..cf059a60d 100644 --- a/test/unittest/base/test_document.py +++ b/test/unittest/base/test_document.py @@ -3,13 +3,14 @@ import tempfile import numpy as np +import pytest from superduper.backends.base.query import Query from superduper.base.constant import KEY_BLOBS, KEY_BUILDS from superduper.base.document import Document from superduper.components.datatype import ( - Artifact, - DataType, + BaseDataType, + pickle_encoder, pickle_serializer, ) from superduper.components.model import ObjectModel @@ -17,12 +18,16 @@ from superduper.components.table import Table -def test_document_encoding(): - document = Document({'x': pickle_serializer(np.random.rand(20))}) +def test_document_encoding(db): + schema = Schema('tmp', fields={'x': pickle_serializer}, db=db) + document = Document({'x': np.random.rand(20)}, schema=schema) new_document = Document.decode( - document.encode(), getters={'component': lambda x: pickle_serializer} + document.encode(), + schema=schema, + 
db=db, ) - assert (new_document['x'].x - document['x'].x).sum() == 0 + new_document = new_document.unpack() + assert (new_document['x'] - document['x']).sum() == 0 def test_flat_query_encoding(): @@ -67,6 +72,7 @@ def test_encode_decode_flattened_document(): assert isinstance(next(iter(encoded_r[KEY_BLOBS].values())), bytes) +@pytest.mark.skip def test_encode_model_with_remote_file(db): r = { '_base': '?20d76167d4a6ad7fe00250e8359d0dca', @@ -100,6 +106,7 @@ def test_encode_model_with_remote_file(db): assert r.readlines() == read +@pytest.mark.skip def test_encode_model_with_remote_blob(): m = ObjectModel( identifier='test', @@ -136,7 +143,9 @@ def test_encode_model(): pprint.pprint(encoded_r) decoded_r = Document.decode( - encoded_r, getters={'blob': lambda x: encoded_r[KEY_BLOBS][x]} + encoded_r, + getters={'blob': lambda x: encoded_r[KEY_BLOBS][x]}, + schema=m.build_class_schema(), ) print(decoded_r) @@ -144,9 +153,7 @@ def test_encode_model(): m = decoded_r.unpack() assert isinstance(m, ObjectModel) - assert isinstance(m.object, Artifact) - - pprint.pprint(m) + assert callable(m.object) r = m.dict() @@ -158,12 +165,12 @@ def test_encode_model(): pprint.pprint(m.dict().encode()) -def test_decode_inline_data(): - schema = Schema('my-schema', fields={'data': pickle_serializer}) +def test_decode_inline_data(db): + schema = Schema('my-schema', fields={'data': pickle_encoder}, db=db) r = { 'x': 2, - 'data': pickle_serializer.encode_data(np.random.randn(20)), + 'data': pickle_encoder.encode_data(np.random.randn(20)), } r = Document.decode(r, schema=schema).unpack() @@ -171,7 +178,7 @@ def test_decode_inline_data(): def test_refer_to_applied_item(db): - dt = DataType(identifier='my-type', encodable='artifact') + dt = pickle_serializer db.apply(dt) m = ObjectModel( @@ -183,14 +190,14 @@ def test_refer_to_applied_item(db): db.apply(m) r = db.metadata.get_component_by_uuid(m.uuid) - assert r['datatype'].startswith('&:component:datatype:my-type') + assert r['datatype'].startswith('&:component:datatype:pickle_serializer') import pprint pprint.pprint(r) print(db.show('datatype')) - dt = db.load('datatype', 'my-type', 0) + dt = db.load('datatype', 'pickle_serializer', 0) print(dt) c = db.load('model', 'test') print(c) @@ -220,38 +227,23 @@ def test_column_encoding(db): def test_refer_to_system(db): - from superduper.components.datatype import DataType, methods - - serializer = DataType( - identifier='my-datatype', - encodable='encodable', - db=db, - **methods['pickle'], - ) - db.apply(serializer) - db.artifact_store.put_bytes( - serializer.encode_data(np.random.rand(3)), file_id='12345' + pickle_serializer._encode_data(np.random.rand(3)), file_id='12345' ) r = { - '_builds': { - 'my_artifact': { - '_path': 'superduper.components.datatype.LazyArtifact', - 'blob': '&:blob:12345', - 'datatype': "&:component:datatype:my-datatype", - } - }, - 'data': '?my_artifact', + 'data': '&:blob:12345', } - r = Document.decode(r, db=db).unpack() + r = Document.decode( + r, db=db, schema=Schema('tmp', fields={'data': pickle_serializer}) + ).unpack() assert isinstance(r['data'], np.ndarray) def test_encode_same_identifier(): - datatype = DataType(identifier="a") + datatype = BaseDataType(identifier="a") model = ObjectModel(identifier="a", object=lambda x: x, datatype=datatype) listener = model.to_listener(identifier="a", key="a", select=None) diff --git a/test/unittest/base/test_leaf.py b/test/unittest/base/test_leaf.py index 749c21b4b..3d8e22f06 100644 --- a/test/unittest/base/test_leaf.py +++ 
b/test/unittest/base/test_leaf.py @@ -4,7 +4,7 @@ from superduper import ObjectModel from superduper.backends.base.query import Query -from superduper.base.constant import KEY_BUILDS +from superduper.base.constant import KEY_BLOBS, KEY_BUILDS from superduper.base.document import Document from superduper.base.leaf import Leaf from superduper.components.component import Component @@ -24,12 +24,10 @@ class TestSubModel(Component): type_id: t.ClassVar[str] = 'test-sub-model' a: int = 1 b: str = 'b' - c: ObjectModel = dc.field( - default_factory=ObjectModel(identifier='test-2', object=lambda x: x + 2) - ) + c: ObjectModel | None = None d: t.List[ObjectModel] = dc.field(default_factory=[]) - e: OtherSer = dc.field(default_factory=OtherSer(identifier='test', d='test')) - f: t.Callable = dc.field(default=lambda x: x) + e: OtherSer | None = None + f: t.Callable class MySer(Leaf): @@ -111,15 +109,16 @@ def test_component_with_document(): f=lambda x: x, ) print('encoding') - d = Document(t.dict()) - r = d.encode() + d = t.dict() + r = d.encode(leaves_to_keep=Leaf) builds = r[KEY_BUILDS] pprint(r) - assert len(builds) == 8 + assert len(builds) == 3 + assert len(r[KEY_BLOBS]) == 1 for leaf in builds: - print(type(leaf)) + print(type(builds[leaf])) def test_find_variables(): diff --git a/test/unittest/component/datatype/test_file.py b/test/unittest/component/datatype/test_file.py index 659c16f55..4aff00316 100644 --- a/test/unittest/component/datatype/test_file.py +++ b/test/unittest/component/datatype/test_file.py @@ -3,7 +3,7 @@ import pytest -from superduper import DataType +from superduper.components.datatype import file @pytest.fixture @@ -16,45 +16,17 @@ def random_data(tmpdir): return file_name -def dt_file(): - return DataType("my-file", encodable="file") +def test_data_with_schema(db, random_data): + datatype_utils.check_data_with_schema(random_data, file, db=db) -def dt_file_lazy(): - return DataType("my-file", encodable="lazy_file") +def test_data_with_schema_and_db(random_data, db): + datatype_utils.check_data_with_schema_and_db(random_data, file, db) -datatypes = [ - dt_file(), - dt_file_lazy(), -] +def test_component(random_data): + datatype_utils.check_component(random_data, file) -@pytest.mark.parametrize("datatype", datatypes) -def test_data_with_schema(db, datatype: DataType, random_data): - datatype_utils.check_data_with_schema(random_data, datatype, db=db) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_data_with_schema_and_db(datatype: DataType, random_data, db): - datatype_utils.check_data_with_schema_and_db(random_data, datatype, db) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_data_without_schema(datatype: DataType, random_data): - datatype_utils.check_data_without_schema(random_data, datatype) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_data_without_schema_and_db(datatype: DataType, random_data, db): - datatype_utils.check_data_without_schema_and_db(random_data, datatype, db) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_component(random_data, datatype): - datatype_utils.check_component(random_data, datatype) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_component_with_db(db, random_data, datatype): - datatype_utils.check_component_with_db(random_data, datatype, db) +def test_component_with_db(db, random_data): + datatype_utils.check_component_with_db(random_data, file, db) diff --git a/test/unittest/component/datatype/test_pickle.py b/test/unittest/component/datatype/test_pickle.py index 
f152aa706..1e42dc987 100644 --- a/test/unittest/component/datatype/test_pickle.py +++ b/test/unittest/component/datatype/test_pickle.py @@ -6,9 +6,8 @@ from superduper.base.enums import DBType from superduper.components.datatype import ( - DataType, + BaseDataType, pickle_encoder, - pickle_lazy, pickle_serializer, ) @@ -22,30 +21,19 @@ def random_data(): datatypes = [ pickle_encoder, pickle_serializer, - pickle_lazy, ] @pytest.mark.parametrize("datatype", datatypes) -def test_data_with_schema(db, datatype: DataType, random_data: pd.DataFrame): +def test_data_with_schema(db, datatype: BaseDataType, random_data: pd.DataFrame): datatype_utils.check_data_with_schema(random_data, datatype, db) @pytest.mark.parametrize("datatype", datatypes) -def test_data_with_schema_and_db(datatype: DataType, random_data: pd.DataFrame, db): +def test_data_with_schema_and_db(datatype: BaseDataType, random_data: pd.DataFrame, db): datatype_utils.check_data_with_schema_and_db(random_data, datatype, db) -@pytest.mark.parametrize("datatype", datatypes) -def test_data_without_schema(datatype: DataType, random_data: pd.DataFrame): - datatype_utils.check_data_without_schema(random_data, datatype) - - -@pytest.mark.parametrize("datatype", datatypes) -def test_data_without_schema_and_db(datatype: DataType, random_data: pd.DataFrame, db): - datatype_utils.check_data_without_schema_and_db(random_data, datatype, db) - - @pytest.mark.parametrize("datatype", datatypes) def test_component(random_data, datatype): datatype_utils.check_component(random_data, datatype) diff --git a/test/unittest/component/test_component.py b/test/unittest/component/test_component.py index 64cefb769..1a363d29e 100644 --- a/test/unittest/component/test_component.py +++ b/test/unittest/component/test_component.py @@ -10,10 +10,8 @@ from superduper.base.annotations import trigger from superduper.components.component import Component from superduper.components.datatype import ( - Artifact, - DataType, - Empty, - LazyArtifact, + BaseDataType, + Blob, dill_serializer, ) from superduper.components.listener import Listener @@ -32,27 +30,41 @@ def cleanup(): @dc.dataclass(kw_only=True) class MyComponent(Component): type_id: t.ClassVar[str] = "my_type" - _lazy_fields: t.ClassVar[t.Sequence[str]] = ("my_dict",) + _fields = { + 'my_dict': dill_serializer, + 'nested_list': dill_serializer, + } my_dict: t.Dict nested_list: t.List a: t.Callable -def test_init(monkeypatch): - from unittest.mock import MagicMock +def test_reload(db): + m = ObjectModel('test', object=lambda x: x + 1) - e = Artifact(x=None, identifier="123", datatype=dill_serializer) - a = Artifact(x=None, identifier="456", datatype=dill_serializer) + db.apply(m) - def side_effect(*args, **kwargs): - a.x = lambda x: x + 1 + reloaded = db.load('model', 'test') + reloaded.unpack() - a.init = MagicMock() - a.init.side_effect = side_effect - list_ = [e, a] +def test_init(db, monkeypatch): + a = Blob( + identifier="456", + bytes=dill_serializer._encode_data(lambda x: x + 1), + db=db, + ) + my_dict = Blob( + identifier="456", + bytes=dill_serializer._encode_data({'a': lambda x: x + 1}), + db=db, + ) + + list_ = Blob( + identifier='789', bytes=dill_serializer._encode_data([lambda x: x + 1]), db=db + ) - c = MyComponent("test", my_dict={"a": a}, a=a, nested_list=list_) + c = MyComponent("test", my_dict=my_dict, a=a, nested_list=list_) c.init() @@ -62,8 +74,8 @@ def side_effect(*args, **kwargs): assert callable(c.a) assert c.a(1) == 2 - assert callable(c.nested_list[1]) - assert c.nested_list[1](1) == 2 + 
assert callable(c.nested_list[0]) + assert c.nested_list[0](1) == 2 def test_load_lazily(db): @@ -74,8 +86,8 @@ def test_load_lazily(db): reloaded = db.load("model", m.identifier) - assert isinstance(reloaded.object, LazyArtifact) - assert isinstance(reloaded.object.x, Empty) + assert isinstance(reloaded.object, Blob) + assert reloaded.object.bytes is None reloaded.init(db=db) @@ -97,7 +109,7 @@ def load(blob): reloaded = Component.read(save_path) # getters=getters assert isinstance(reloaded, ObjectModel) - assert isinstance(reloaded.datatype, DataType) + assert isinstance(reloaded.datatype, BaseDataType) def test_set_variables(db): @@ -163,6 +175,7 @@ def test_upstream(db, clean): db.apply(m) +# TODO needed? def test_set_db_deep(db): c1 = UpstreamComponent(identifier='c1') m = MyListener( diff --git a/test/unittest/component/test_graph.py b/test/unittest/component/test_graph.py index 0aa836310..eccca41e9 100644 --- a/test/unittest/component/test_graph.py +++ b/test/unittest/component/test_graph.py @@ -12,7 +12,6 @@ def model_object(x): return x + 1 model = ObjectModel(identifier='m1', object=model_object, signature='singleton') - db.add(model) yield model @@ -22,7 +21,6 @@ def model_object(x): return x + 2, x model = ObjectModel(identifier='m2', object=model_object) - db.add(model) yield model @@ -32,7 +30,6 @@ def model_object(x): return {'x': x + 2} model = ObjectModel(identifier='m2_multi_dict', object=model_object) - db.add(model) yield model @@ -42,7 +39,6 @@ def model_object(x, y=1): return x + y + 2 model = ObjectModel(identifier='m2_multi', object=model_object) - db.add(model) yield model @@ -52,7 +48,6 @@ def model_object(x, y): return x + y + 3 model = ObjectModel(identifier='m3', object=model_object) - db.add(model) yield model @@ -150,7 +145,7 @@ def test_complex_graph_with_select(db): def test_serialization(db, model1): g = Graph(identifier='complex-graph', input=model1) original_g = g.G - db.add(g) + db.apply(g) g = db.load('model', 'complex-graph') assert nx.utils.graphs_equal(original_g, g.G) diff --git a/test/unittest/component/test_listener.py b/test/unittest/component/test_listener.py index 78ce5510d..c34d6ff38 100644 --- a/test/unittest/component/test_listener.py +++ b/test/unittest/component/test_listener.py @@ -262,14 +262,12 @@ def test_upstream_serializes(db): upstream=[upstream_component], ) - db.apply(dependent_listener) - listener = Listener( identifier="test-listener", model=ObjectModel("test", object=lambda x: x), select=db[dependent_listener.outputs].select(), key=dependent_listener.outputs, - upstream=[upstream_component], + upstream=[dependent_listener], ) db.apply(listener) diff --git a/test/unittest/component/test_model.py b/test/unittest/component/test_model.py index 791b07aeb..c674d8986 100644 --- a/test/unittest/component/test_model.py +++ b/test/unittest/component/test_model.py @@ -12,7 +12,7 @@ from superduper.base.datalayer import Datalayer from superduper.base.document import Document from superduper.components.dataset import Dataset -from superduper.components.datatype import DataType, pickle_decode, pickle_encode +from superduper.components.datatype import pickle_serializer from superduper.components.metric import Metric from superduper.components.model import ( Mapping, @@ -158,7 +158,7 @@ def test_pm_predict_with_select_ids(monkeypatch, predict_mixin): monkeypatch.setattr( predict_mixin, 'datatype', - DataType(identifier='test', encoder=pickle_encode, decoder=pickle_decode), + pickle_serializer, ) predict_mixin._predict_with_select_and_ids( 
X=X, select=select, ids=ids, predict_id='test' diff --git a/test/unittest/component/test_plugin.py b/test/unittest/component/test_plugin.py index 1cf0e2f6a..8460a5da9 100644 --- a/test/unittest/component/test_plugin.py +++ b/test/unittest/component/test_plugin.py @@ -56,19 +56,9 @@ def create_import_plugin(tempdirname): component_dict = { "_base": "?plugin", "_builds": { - "file_lazy": { - "_path": "superduper.components.datatype.get_serializer", - "method": "file", - "encodable": "lazy_file", - }, - "file_id": { - "_path": "superduper.components.datatype.LazyFile", - "datatype": "?file_lazy", - "x": "&:file:p_import:file_id", - }, "plugin": { "_path": "superduper.components.plugin.Plugin", - "path": "?file_id", + "path": "&:file:p_import:file_id", }, }, } diff --git a/test/unittest/component/test_schema.py b/test/unittest/component/test_schema.py index a4893eb48..54d735f63 100644 --- a/test/unittest/component/test_schema.py +++ b/test/unittest/component/test_schema.py @@ -1,5 +1,28 @@ -from superduper import Schema, Table -from superduper.components.datatype import pickle_encoder +import typing as t + +import pytest + +from superduper import Component, Schema, Table +from superduper.components.datatype import ( + Blob, + File, + dill_serializer, + file, + pickle_encoder, + pickle_serializer, +) + + +class TestComponent(Component): + _fields = {'a': dill_serializer, 'b': file} + + a: t.Callable + b: str | None = None + + +class TestUnannotatedComponent(Component): + a: t.Callable + b: t.Optional[t.Callable] def test_schema_with_bytes_encoding(db): @@ -28,3 +51,115 @@ assert isinstance(r['txt'], str) r = db['documents'].find_one() + + +def test_schema_with_blobs(db): + db.apply( + Table( + 'documents', + schema=Schema('_schema/documents', fields={'txt': pickle_serializer}), + ) + ) + + db['documents'].insert([{'txt': 'testing 123'}]).execute() + + r = db['documents'].select().tolist()[0] + + assert isinstance(r['txt'], Blob) + + # artifacts are loaded lazily and initially empty + assert r['txt'].bytes is None + + # artifacts are downloaded and decoded with `.unpack()` + assert r.unpack()['txt'] == 'testing 123' + + +@pytest.fixture +def tmp_file(): + # named `file_path` to avoid shadowing the imported `file` datatype + file_path = '/tmp/test_schema_with_file.txt' + with open(file_path, 'a') as f: + f.write('Hello 123') + + yield file_path + + import os + + os.remove(file_path) + + +def test_schema_with_file(db, tmp_file): + # the `file` datatype copies a file + # to the artifact store when a reference document + # containing a file field is inserted + db.apply( + Table( + 'documents', + schema=Schema('_schema/documents', fields={'my_file': file}), + ) + ) + db['documents'].insert([{'my_file': tmp_file}]).execute() + + # only the references are loaded when data is selected + r = db['documents'].select().tolist()[0] + + # loaded document contains a pointer to the file + assert isinstance(r['my_file'], File) + + # however the path has not been populated + assert not r['my_file'].path + + # unpacking the document fetches the file from the artifact-store + rr = r.unpack() + + # the path has been populated + assert r['my_file'].path + + # and is also now local + import os + + assert os.path.exists(r['my_file'].path) + + # the unpacked value contains the local path + # this may be different from the original file path + assert rr['my_file'] == r['my_file'].path + + with open(rr['my_file']) as f: + assert f.read().split('\n')[0] == 'Hello 123'
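
The next test pins down the reference convention that `_fields` implies: `Component.dict()` wraps artifact fields in `Blob`/`File` carriers, and `encode()` flattens those carriers into artifact-store references. A condensed sketch of that round trip under the same assumptions, reusing the `TestComponent` declared at the top of this file (the lambda and path are placeholders):

```python
import typing as t

from superduper import Component
from superduper.components.datatype import Blob, dill_serializer, file


class TestComponent(Component):
    _fields = {'a': dill_serializer, 'b': file}

    a: t.Callable
    b: str | None = None


# placeholder values; the path must exist on disk, as in the fixture above
c = TestComponent('test', a=lambda x: x + 1, b='/tmp/some_file.txt')

r = c.dict()                      # artifact fields are wrapped in place
assert isinstance(r['a'], Blob)   # the callable is carried as a Blob

encoded = r.encode()              # carriers flatten to store references
assert encoded['a'].startswith('&:blob:')
assert encoded['b'].startswith('&:file:')
```

+ + +def test_component_serializes_with_schema(db, tmp_file): + c = TestComponent('test', 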
a='testing testing 123', b=tmp_file) + + r = c.dict() + + r_encoded = r.encode() + + import pprint + + pprint.pprint(r.schema) + + pprint.pprint(r_encoded) + + assert isinstance(r['a'], Blob) + + assert r_encoded['a'].startswith('&:blob:') + assert r_encoded['b'].startswith('&:file:') + + +def test_auto_infer_fields(): + s = TestUnannotatedComponent.build_class_schema() + + assert isinstance(s, Schema) + + import pprint + + pprint.pprint(s) + + assert list(s.fields.keys()) == ['a', 'b'] + + +def test_wrap_function_with_blob(): + r = TestComponent('test', a=lambda x: x + 1).dict() + + assert isinstance(r['a'], Blob) diff --git a/test/unittest/component/test_serialization.py b/test/unittest/component/test_serialization.py index bb7994b9f..32b0f13cc 100644 --- a/test/unittest/component/test_serialization.py +++ b/test/unittest/component/test_serialization.py @@ -1,14 +1,14 @@ -from superduper.components.datatype import pickle_serializer +from superduper.components.datatype import dill_serializer from superduper.components.model import ObjectModel def test_model(): m = ObjectModel( identifier='test', - datatype=pickle_serializer, + datatype=dill_serializer, object=lambda x: x + 1, ) m_dict = m.dict() assert m_dict['identifier'] == m.identifier - assert m_dict['object'].x == m.object - assert m_dict['datatype'].identifier == 'pickle' + assert m_dict['object'].bytes == dill_serializer._encode_data(m.object) + assert m_dict['datatype'].identifier == 'dill_serializer' diff --git a/test/unittest/component/test_template.py b/test/unittest/component/test_template.py index 26127ec52..988fea3bb 100644 --- a/test/unittest/component/test_template.py +++ b/test/unittest/component/test_template.py @@ -133,6 +133,7 @@ def test_from_template(db): component.init() assert isinstance(component, Listener) assert isinstance(component.model, ObjectModel) + assert component.model.object(3) == 5 diff --git a/test/unittest/test_quality.py b/test/unittest/test_quality.py index 014628f2e..ef384de49 100644 --- a/test/unittest/test_quality.py +++ b/test/unittest/test_quality.py @@ -17,9 +17,9 @@ # over time. If you have decreased the number of defects, change it here, # and take a bow! 
ALLOWABLE_DEFECTS = { - 'cast': 3, # Try to keep this down + 'cast': 1, # Try to keep this down 'noqa': 3, # This should never change - 'type_ignore': 10, # This should only ever increase in obscure edge cases + 'type_ignore': 7, # This should only ever increase in obscure edge cases } diff --git a/test/utils/component/datatype.py b/test/utils/component/datatype.py index 779e606d3..652e75c35 100644 --- a/test/utils/component/datatype.py +++ b/test/utils/component/datatype.py @@ -9,24 +9,12 @@ from superduper.base.document import Document from superduper.base.enums import DBType from superduper.components.component import Component -from superduper.components.datatype import ( - DataType, - Empty, - _BaseEncodable, -) +from superduper.components.datatype import BaseDataType, pickle_serializer from superduper.components.schema import Schema from superduper.components.table import Table def assert_equal(expect, actual): - if isinstance(actual, _BaseEncodable) and actual.lazy: - actual.init() - actual = actual.x - - if isinstance(expect, _BaseEncodable) and expect.lazy: - expect.init() - expect = expect.x - assert isinstance(expect, type(actual)) if isinstance(expect, np.ndarray): assert np.array_equal(expect, actual) @@ -45,7 +33,7 @@ def print_sep(): print("\n", "-" * 80, "\n") -def check_data_with_schema(data, datatype: DataType, db): +def check_data_with_schema(data, datatype, db): print("datatype", datatype) print_sep() schema = Schema(identifier="schema", fields={"x": datatype, "y": int}, db=db) @@ -58,11 +46,8 @@ def check_data_with_schema(data, datatype: DataType, db): pprint(encoded) print_sep() - decoded = Document.decode(encoded, schema=schema) - if datatype.encodable == 'lazy_artifact': - assert isinstance(decoded["x"], datatype.encodable_cls) - assert isinstance(decoded["x"].x, type(data)) - decoded = Document(decoded.unpack()) + decoded = Document.decode(encoded, schema=schema, db=db).unpack() + pprint(decoded) print_sep() @@ -72,7 +57,7 @@ def check_data_with_schema(data, datatype: DataType, db): return document, encoded, decoded -def check_data_with_schema_and_db(data, datatype: DataType, db: Datalayer): +def check_data_with_schema_and_db(data, datatype: BaseDataType, db: Datalayer): print("datatype", datatype) print_sep() schema = Schema(identifier="schema", fields={"x": datatype, "y": int}) @@ -95,11 +80,7 @@ def check_data_with_schema_and_db(data, datatype: DataType, db: Datalayer): print_sep() decoded = list(db["documents"].select().execute())[0] - - if datatype.encodable == 'lazy_artifact': - assert isinstance(decoded["x"], datatype.encodable_cls) - assert isinstance(decoded["x"].x, Empty) - decoded = Document(decoded.unpack()) + decoded = decoded.unpack() pprint(decoded) print_sep() @@ -110,51 +91,6 @@ def check_data_with_schema_and_db(data, datatype: DataType, db: Datalayer): return document, encoded, decoded -def check_data_without_schema(data, datatype: DataType): - print("datatype", datatype) - print_sep() - - document = Document({"x": datatype(data), "y": 1}) - pprint(document) - print_sep() - - encoded = document.encode() - pprint(encoded) - print_sep() - - decoded = Document.decode(encoded) - pprint(decoded) - assert_equal(document["x"], decoded["x"]) - assert_equal(document["y"], decoded["y"]) - return document, encoded, decoded - - -def check_data_without_schema_and_db(data, datatype: DataType, db: Datalayer): - print("datatype", datatype) - print("\n", "-" * 80, "\n") - - table = Table( - "documents", - schema=Schema(identifier="schema", fields={"x": datatype, 
"y": int}), - ) - - db.apply(table) - - document = Document({"x": data, "y": 1}) - print(document) - print("\n", "-" * 80, "\n") - db["documents"].insert([document]).execute() - - decoded = list(db["documents"].select().execute())[0] - pprint(decoded) - print("\n", "-" * 80, "\n") - - assert_equal(document["x"], decoded["x"]) - assert_equal(document["y"], decoded["y"]) - - return document, decoded - - @dc.dataclass(kw_only=True) class ChildComponent(Component): type_id: t.ClassVar[str] = "ChildComponent" @@ -167,18 +103,17 @@ class TestComponent(Component): y: int = 1 x: np.ndarray | None = None child: ChildComponent | None = None - _artifacts: t.ClassVar = () + _fields = {'x': pickle_serializer} -def check_component(data, datatype: DataType): +def check_component(data, datatype: BaseDataType): print("datatype", datatype) print_sep() c = TestComponent( "test", x=data, - child=ChildComponent("child", y=2, artifacts={"x": datatype}), - artifacts={"x": datatype}, + child=ChildComponent("child", y=2), ) pprint(c) print_sep() @@ -203,8 +138,7 @@ def check_component_with_db(data, datatype, db): c = TestComponent( "test", x=data, - child=ChildComponent("child", y=2, artifacts={"x": datatype}), - artifacts={"x": datatype}, + child=ChildComponent("child", y=2), ) db.add(c) pprint(c) diff --git a/test/utils/component/model.py b/test/utils/component/model.py index 502caa486..7a7bcd8a0 100644 --- a/test/utils/component/model.py +++ b/test/utils/component/model.py @@ -25,8 +25,6 @@ def test_predict(model: Model, sample_data: t.Any): def test_predict_in_db(model: Model, sample_data: t.Any, db: "Datalayer"): model.identifier = random_id() - db.apply(model) - db.cfg.auto_schema = True db["datas"].insert([{"data": sample_data, "i": i} for i in range(10)]).execute() diff --git a/test/utils/database/query.py b/test/utils/database/query.py index 5540f16e7..6cfaf64a3 100644 --- a/test/utils/database/query.py +++ b/test/utils/database/query.py @@ -210,16 +210,6 @@ def _check(n): table_or_collection.insert([data]).execute() _check(2) - # Without `Document` non dict data - table_or_collection.insert([np.zeros((1))]).execute() - c = _check(3) - - gt = np.zeros((1)) - - # Auto wrapped _base - assert "x" in c[-1] - assert c[-1].unpack()["x"] == gt - def test_model(db): from test.utils.setup.fake_data import add_models diff --git a/test/utils/setup/fake_data.py b/test/utils/setup/fake_data.py index 2c28b480a..9dac29ee7 100644 --- a/test/utils/setup/fake_data.py +++ b/test/utils/setup/fake_data.py @@ -10,7 +10,7 @@ from superduper.components.schema import Schema from superduper.components.table import Table from superduper.components.vector_index import VectorIndex -from superduper.ext.numpy.encoder import array +from superduper.ext.numpy.encoder import Array GLOBAL_TEST_N_DATA_POINTS = 100 @@ -31,7 +31,7 @@ def add_random_data( table_name: str = "documents", n: int = GLOBAL_TEST_N_DATA_POINTS, ): - float_array = array(dtype="float", shape=(32,)) + float_array = Array(dtype="float", shape=(32,)) schema = Schema( identifier=table_name, @@ -57,16 +57,16 @@ def add_random_data( def add_datatypes(db: Datalayer): for n in [8, 16, 32]: - db.apply(array(dtype="float", shape=(n,))) + db.apply(Array(dtype="float", shape=(n,))) def add_models(db: Datalayer): # identifier, weight_shape, encoder params = [ - ["linear_a", (32, 16), array(dtype="float", shape=(16,)), False], - ["linear_a_multi", (32, 16), array(dtype="float", shape=(16,)), True], - ["linear_b", (16, 8), array(dtype="float", shape=(8,)), False], - ["linear_b_multi", 
(16, 8), array(dtype="float", shape=(8,)), True], + ["linear_a", (32, 16), Array(dtype="float", shape=(16,)), False], + ["linear_a_multi", (32, 16), Array(dtype="float", shape=(16,)), True], + ["linear_b", (16, 8), Array(dtype="float", shape=(8,)), False], + ["linear_b_multi", (16, 8), Array(dtype="float", shape=(8,)), True], ] for identifier, weight_shape, datatype, flatten in params: weight = np.random.randn(weight_shape[1])
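
The `array` → `Array` rename in `fake_data.py` above keeps the `dtype`/`shape` call signature. For reference, a minimal sketch of the updated usage when declaring a schema, mirroring `add_random_data` (the table name and fields are illustrative):

```python
from superduper.components.schema import Schema
from superduper.components.table import Table
from superduper.ext.numpy.encoder import Array

# the lowercase `array(...)` factory becomes the `Array(...)` class
float_array = Array(dtype="float", shape=(32,))

schema = Schema(
    identifier="documents",
    fields={"x": float_array, "y": int},
)
table = Table("documents", schema=schema)
# db.apply(table) would register the schema, as in add_random_data above
```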