diff --git a/docs/content/reusable_snippets/create_vector_search_model.ipynb b/docs/content/reusable_snippets/create_vector_search_model.ipynb
index 6236d02e8..b678c1d61 100644
--- a/docs/content/reusable_snippets/create_vector_search_model.ipynb
+++ b/docs/content/reusable_snippets/create_vector_search_model.ipynb
@@ -16,8 +16,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from superduperdb.base.variables import Variable\n",
-    "item = {indexing_key: Variable('query')}"
+    "item = {indexing_key: '<var:query>'}"
    ]
   },
   {
diff --git a/docs/content/use_cases/multimodal_vector_search_video.ipynb b/docs/content/use_cases/multimodal_vector_search_video.ipynb
index 1e9391980..8baff2983 100644
--- a/docs/content/use_cases/multimodal_vector_search_video.ipynb
+++ b/docs/content/use_cases/multimodal_vector_search_video.ipynb
@@ -1 +1 @@
-{"metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2"}}, "nbformat": 4, "nbformat_minor": 5, "cells": [{"cell_type": "markdown", "id": "38c1a328-fd86-4c5f-bd54-b8664f433608", "metadata": {}, "source": ["<!-- TABS -->\n", "# Multimodal vector search - Video"]}, {"cell_type": "markdown", "id": "f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6", "metadata": {}, "source": ["<!-- TABS -->\n", "## Configure your production system"]}, {"cell_type": "markdown", "id": "81e7cd59-67d0-4776-aea1-4864aa768f95", "metadata": {}, "source": [":::note\n", "If you would like to use the production features \n", "of SuperDuperDB, then you should set the relevant \n", "connections and configurations in a configuration \n", "file. Otherwise you are welcome to use \"development\" mode \n", "to get going with SuperDuperDB quickly.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "62014646-ccd4-4d10-ac26-1c470f88f2f2", "metadata": {}, "outputs": [], "source": ["import os\n", "\n", "os.makedirs('.superduperdb', exist_ok=True)\n", "os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]}, {"cell_type": "code", "execution_count": null, "id": "8e50edd2-438d-44ab-9da0-0b72197df262", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Community>\n", "CFG = '''\n", "data_backend: mongodb://127.0.0.1:27017/documents\n", "artifact_store: filesystem://./artifact_store\n", "cluster:\n", "  cdc:\n", "    strategy: null\n", "    uri: ray://127.0.0.1:20000\n", "  compute:\n", "    uri: ray://127.0.0.1:10001\n", "  vector_search:\n", "    backfill_batch_size: 100\n", "    type: in_memory\n", "    uri: http://127.0.0.1:21000\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": 
"1ad9ee67-6402-45ea-8311-3efb039b5df3", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Atlas>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "        type: native\n", "databackend: mongodb+srv://<user>:<password>@<mongo-host>:27017/documents\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9c9e8351-b17f-4882-bda6-5ad51dbc7e1f", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: sqlite://<path-to-db>.db\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mysql://<user>:<password>@<host>:<port>/database\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9b7ac715-712c-4ec7-be90-0aaa22518977", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mssql://<user>:<password>@<host>:<port>\n", 
"'''"]}, {"cell_type": "code", "execution_count": null, "id": "f21fad9c-cc0e-4cf5-83f0-41a3a614c6af", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: postgres://<user>:<password>@<host>:<port</<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "1badb5a3-823c-4463-ab79-6f4f9239dabe", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: snowflake://<user>:<password>@<account>/<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "ae7807d9-9fc1-4c18-8027-a512f827783d", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: clickhouse://<user>:<password>@<host>:<port>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "fc40c13b-9bc5-47ac-86d6-ef7a379c45ee", "metadata": {}, "outputs": [], "source": ["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n", "    f.write(CFG)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Start your cluster"]}, {"cell_type": 
"markdown", "metadata": {}, "source": [":::note\n", "Starting a SuperDuperDB cluster is useful in production and model development\n", "if you want to enable scalable compute, access to the models by multiple users for collaboration, \n", "monitoring.\n", "\n", "If you don't need this, then it is simpler to start in development mode.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Experimental Cluster>\n", "!python -m superduperdb local-cluster up"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Docker-Compose>\n", "!make build_sandbox\n", "!make testenv_init"]}, {"cell_type": "markdown", "id": "32f8484d-2e35-472a-9b24-1a30ec1d144b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Connect to SuperDuperDB"]}, {"cell_type": "markdown", "id": "06d66021-ce62-4021-a2c5-158dee92b3bb", "metadata": {}, "source": [":::note\n", "Note that this is only relevant if you are running SuperDuperDB in development mode.\n", "Otherwise refer to \"Configuring your production system\".\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "61976f44-8139-41c0-a73e-569c6d16c4b1", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongodb://localhost:27017/documents')"]}, {"cell_type": "code", "execution_count": null, "id": "e981a457", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "from superduperdb import superduper\n", "db = superduper('sqlite://my_db.db')"]}, {"cell_type": "code", "execution_count": null, "id": "19ecf7c0-b730-4503-9b5d-e97697b3bcee", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "from superduperdb import superduper\n", "\n", "user = 'superduper'\n", "password = 'superduper'\n", "port = 3306\n", "host = 'localhost'\n", "database = 'test_db'\n", "\n", "db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]}, 
{"cell_type": "code", "execution_count": null, "id": "df208e8c-4fd0-438f-af29-22a763a2aebd", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "from superduperdb import superduper\n", "\n", "user = 'sa'\n", "password = 'Superduper#1'\n", "port = 1433\n", "host = 'localhost'\n", "\n", "db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]}, {"cell_type": "code", "execution_count": null, "id": "d2297295", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "!pip install psycopg2\n", "from superduperdb import superduper\n", "\n", "user = 'postgres'\n", "password = 'postgres'\n", "port = 5432\n", "host = 'localhost'\n", "database = 'test_db'\n", "db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n", "\n", "db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]}, {"cell_type": "code", "execution_count": null, "id": "cc6c8517", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "from superduperdb import superduper\n", "\n", "user = \"superduperuser\"\n", "password = \"superduperpassword\"\n", "account = \"XXXX-XXXX\"  # ORGANIZATIONID-USERID\n", "database = \"FREE_COMPANY_DATASET/PUBLIC\"\n", "\n", "snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n", "\n", "db = superduper(\n", "    snowflake_uri, \n", "    metadata_store='sqlite:///your_database_name.db',\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "from superduperdb import superduper\n", "\n", "user = 'default'\n", "password = ''\n", "port = 8123\n", "host = 'localhost'\n", "\n", "db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "0e89c8dd-d845-423a-9acc-97e3360d370c", "metadata": {}, "outputs": [], "source": ["# <tab: DuckDB>\n", "from superduperdb import 
superduper\n", "\n", "db = superduper('duckdb://mydb.duckdb')"]}, {"cell_type": "code", "execution_count": null, "id": "2de71562", "metadata": {}, "outputs": [], "source": ["# <tab: Pandas>\n", "from superduperdb import superduper\n", "\n", "db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "cb029a5e-fedf-4f07-8a31-d220cfbfbb3d", "metadata": {}, "outputs": [], "source": ["# <tab: MongoMock>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongomock:///test_db')"]}, {"cell_type": "markdown", "id": "032c2e7b-3f54-4263-b778-0fef60596efb", "metadata": {}, "source": ["<!-- TABS -->\n", "## Get useful sample data"]}, {"cell_type": "code", "execution_count": null, "id": "1b6f7ccb", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/videos.zip && unzip videos.zip\n", "import os\n", "\n", "data = [f'videos/{x}' for x in os.listdir('./videos')]\n", "sample_datapoint = data[-1]\n", "\n", "from superduperdb.ext.pillow import pil_image\n", "chunked_model_datatype = pil_image"]}, {"cell_type": "code", "execution_count": null, "id": "44a702b1-faf9-4edb-8a55-efc4add84a83", "metadata": {}, "outputs": [], "source": ["datas = [{'x': d} for d in data[:3]]"]}, {"cell_type": "markdown", "id": "b31257e4-06fa-4cc7-9626-bb4d03fdc029", "metadata": {}, "source": ["<!-- TABS -->\n", "## Create datatype"]}, {"cell_type": "markdown", "id": "43284218", "metadata": {}, "source": ["Data types such as \"text\" or \"integer\" which are natively support by your `db.databackend` don't need a datatype.\n", "\n", "Otherwise do one of the following:"]}, {"cell_type": "code", "execution_count": null, "id": "e844c762-3391-401d-9047-ed8617a9c946", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "from superduperdb import DataType\n", "\n", "# Create an instance of the Encoder with the identifier 'video_on_file' and load_hybrid set to 
False\n", "datatype = DataType(\n", "    identifier='video_on_file',\n", "    encodable='file',\n", ")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Setup tables or collections"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["from superduperdb.components.table import Table\n", "from superduperdb import Schema\n", "\n", "schema = Schema(identifier=\"schema\", fields={\"x\": datatype})\n", "table_or_collection = Table(\"documents\", schema=schema)\n", "db.apply(table_or_collection)"]}, {"cell_type": "markdown", "id": "a947c52e-919e-4440-b1d6-914e690314d4", "metadata": {}, "source": ["Inserting data, all fields will be matched with the schema for data conversion."]}, {"cell_type": "code", "execution_count": null, "id": "afead32f-fc4c-4b11-9d31-d38bf061c232", "metadata": {}, "outputs": [], "source": ["db['documents'].insert(datas).execute()\n", "select = db['documents'].select()"]}, {"cell_type": "markdown", "id": "54fea927-ee4a-44cd-aaf2-634b574c316d", "metadata": {}, "source": ["<!-- TABS -->\n", "## Apply a chunker for search"]}, {"cell_type": "markdown", "id": "06d90bda-e8c4-494e-a38c-837fb63689ae", "metadata": {}, "source": [":::note\n", "Note that applying a chunker is ***not*** mandatory for search.\n", "If your data is already chunked (e.g. 
short text snippets or audio) or if you\n", "are searching through something like images, which can't be chunked, then this\n", "won't be necessary.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "f093a6d0-9d2f-4ecf-b1bd-0027302c62de", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "!pip install opencv-python\n", "import cv2\n", "import tqdm\n", "from PIL import Image\n", "from superduperdb.ext.pillow import pil_image\n", "from superduperdb import model, Schema\n", "\n", "\n", "@model(\n", "    flatten=True,\n", "    model_update_kwargs={'document_embedded': False},\n", ")\n", "def chunker(video_file):\n", "    # Set the sampling frequency for frames\n", "    sample_freq = 10\n", "    \n", "    # Open the video file using OpenCV\n", "    cap = cv2.VideoCapture(video_file)\n", "    \n", "    # Initialize variables\n", "    frame_count = 0\n", "    fps = cap.get(cv2.CAP_PROP_FPS)\n", "    extracted_frames = []\n", "    progress = tqdm.tqdm()\n", "\n", "    # Iterate through video frames\n", "    while True:\n", "        ret, frame = cap.read()\n", "        if not ret:\n", "            break\n", "        \n", "        # Get the current timestamp based on frame count and FPS\n", "        current_timestamp = frame_count // fps\n", "        \n", "        # Sample frames based on the specified frequency\n", "        if frame_count % sample_freq == 0:\n", "            extracted_frames.append({\n", "                'image': Image.fromarray(frame[:,:,::-1]),  # Convert BGR to RGB\n", "                'current_timestamp': current_timestamp,\n", "            })\n", "        frame_count += 1\n", "        progress.update(1)\n", "    \n", "    # Release resources\n", "    cap.release()\n", "    cv2.destroyAllWindows()\n", "    \n", "    # Return the list of extracted frames\n", "    return extracted_frames"]}, {"cell_type": "markdown", "id": "b33a16f9-3bac-45bb-80ac-3ccf265dce5f", "metadata": {}, "source": ["Now we apply this chunker to the data 
by wrapping the chunker in `Listener`:"]}, {"cell_type": "code", "execution_count": null, "id": "93d21872-d4dc-40dc-abab-fb07ba102ea3", "metadata": {}, "outputs": [], "source": ["from superduperdb import Listener\n", "\n", "upstream_listener = Listener(\n", "    model=chunker,\n", "    select=select,\n", "    key='x',\n", "    uuid=\"chunk\",\n", ")\n", "\n", "db.apply(upstream_listener)"]}, {"cell_type": "markdown", "id": "907721f8-d5bf-4623-8871-3ab9a05001d7", "metadata": {}, "source": ["## Build multimodal embedding models"]}, {"cell_type": "markdown", "id": "033e1eaf-2cdb-499a-ba83-cf080a1a6fda", "metadata": {}, "source": ["We define the output data type of a model as a vector for vector transformation."]}, {"cell_type": "code", "execution_count": null, "id": "28848ff1-45ab-4926-8676-777edf237347", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb.components.vector_index import vector\n", "output_datatpye = vector(shape=(1024,))"]}, {"cell_type": "code", "execution_count": null, "id": "6acf66c5-7369-4aa8-a8a0-5842bd17b469", "metadata": {}, "outputs": [], "source": ["# <tab: SQL>\n", "from superduperdb.components.vector_index import sqlvector\n", "output_datatpye = sqlvector(shape=(1024,))"]}, {"cell_type": "markdown", "id": "143bf946-64b7-4452-8d20-44f2f9ae3fd6", "metadata": {}, "source": ["Then define two models, one for text embedding and one for image embedding."]}, {"cell_type": "code", "execution_count": null, "id": "f33513d3-9f86-4108-8f8b-4a6251bdd9fd", "metadata": {}, "outputs": [], "source": ["# <tab: Text-Image>\n", "!pip install git+https://github.com/openai/CLIP.git\n", "import clip\n", "from superduperdb import vector\n", "from superduperdb.ext.torch import TorchModel\n", "\n", "# Load the CLIP model and obtain the preprocessing function\n", "model, preprocess = clip.load(\"ViT-B/32\", device='cpu')\n", "\n", "# Create a TorchModel for text encoding\n", "compatible_model = TorchModel(\n", "    
identifier='clip_text', # Unique identifier for the model\n", "    object=model, # CLIP model\n", "    preprocess=lambda x: clip.tokenize(x)[0],  # Model input preprocessing using CLIP \n", "    postprocess=lambda x: x.tolist(), # Convert the model output to a list\n", "    datatype=output_datatpye,  # Vector encoder with shape (1024,)\n", "    forward_method='encode_text', # Use the 'encode_text' method for forward pass \n", ")\n", "\n", "# Create a TorchModel for visual encoding\n", "model = TorchModel(\n", "    identifier='clip_image',  # Unique identifier for the model\n", "    object=model.visual,  # Visual part of the CLIP model    \n", "    preprocess=preprocess, # Visual preprocessing using CLIP\n", "    postprocess=lambda x: x.tolist(), # Convert the output to a list \n", "    datatype=output_datatpye, # Vector encoder with shape (1024,)\n", ")"]}, {"cell_type": "markdown", "id": "3d0119da-9cfd-4a60-8847-c3bfdf37697f", "metadata": {}, "source": ["Because we use multimodal models, we define different keys to specify which model to use for embedding calculations in the vector_index."]}, {"cell_type": "code", "execution_count": null, "id": "12e75fab-8504-4d17-a7d9-f98667a5d6aa", "metadata": {}, "outputs": [], "source": ["compatible_key = 'text' # we use text key for text embedding\n", "indexing_key = upstream_listener.outputs_key + '.image' # we use indexing_key for image embedding, use the image field of the result\n", "select = upstream_listener.outputs_select"]}, {"cell_type": "markdown", "id": "41b8b40d-3750-4d7b-aa60-62e07b734b04", "metadata": {}, "source": ["## Create vector-index"]}, {"cell_type": "code", "execution_count": null, "id": "66ee3ff4-880e-477b-bbdf-5b8d89c56de2", "metadata": {}, "outputs": [], "source": ["vector_index_name = 'my-vector-index'"]}, {"cell_type": "code", "execution_count": null, "id": "4cede653", "metadata": {}, "outputs": [], "source": ["# <tab: 2-Modalities>\n", "from superduperdb import VectorIndex, Listener\n", "\n", 
"jobs, _ = db.add(\n", "    VectorIndex(\n", "        vector_index_name,\n", "        indexing_listener=Listener(\n", "            key=indexing_key,      # the `Document` key `model` should ingest to create embedding\n", "            select=select,       # a `Select` query telling which data to search over\n", "            model=model,         # a `_Predictor` how to convert data to embeddings\n", "        ),\n", "        compatible_listener=Listener(\n", "            key=compatible_key,      # the `Document` key `model` should ingest to create embedding\n", "            model=compatible_model,         # a `_Predictor` how to convert data to embeddings\n", "            active=False,\n", "            select=None,\n", "        )\n", "    )\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "067a1203-8dbc-4d0a-aa4a-705d99902d52", "metadata": {}, "outputs": [], "source": ["query_table_or_collection = select.table_or_collection"]}, {"cell_type": "markdown", "id": "b8a87f9d-581a-419a-81b8-a743250413e9", "metadata": {}, "source": ["## Perform a vector search\n", "\n", "We can perform the vector searches using text description:"]}, {"cell_type": "code", "execution_count": null, "id": "ce565823-4655-488c-8684-2240107fa30d", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "from superduperdb import Document\n", "item = Document({compatible_key: \"The moment of a soccer shot\"})"]}, {"cell_type": "markdown", "id": "fc3ba07d-1124-4d94-a117-60d2e72581f7", "metadata": {}, "source": ["Once we have this search target, we can execute a search as follows."]}, {"cell_type": "code", "execution_count": null, "id": "a061de0b-2694-4b36-844c-7753a465360f", "metadata": {}, "outputs": [], "source": ["select = query_table_or_collection.like(item, vector_index=vector_index_name, n=5).select()\n", "results = list(db.execute(select))"]}, {"cell_type": "markdown", "id": "9b6d9af9-a012-42bd-aad4-31b92d089caa", "metadata": {}, "source": ["## Visualize Results"]}, 
{"cell_type": "code", "execution_count": null, "id": "9e2ecea5-3a58-457c-ac50-ddc742484f2d", "metadata": {}, "outputs": [], "source": ["from IPython.display import display\n", "for result in results:\n", "    display(Document(result.unpack())[indexing_key])"]}, {"cell_type": "markdown", "id": "693b4878-39a2-444d-8e17-72a00e6c246d", "metadata": {}, "source": ["## Check the system stays updated\n", "\n", "You can add new data; once the data is added, all related models will perform calculations according to the underlying constructed model and listener, simultaneously updating the vector index to ensure that each query uses the latest data."]}, {"cell_type": "code", "execution_count": null, "id": "5ef97f5a-bb41-46ca-a85e-489824741216", "metadata": {}, "outputs": [], "source": ["new_datas = [{'x': data[-1]}]\n", "ids = db['documents'].insert(new_datas).execute()"]}]}
\ No newline at end of file
+{"metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2"}}, "nbformat": 4, "nbformat_minor": 5, "cells": [{"cell_type": "markdown", "id": "38c1a328-fd86-4c5f-bd54-b8664f433608", "metadata": {}, "source": ["<!-- TABS -->\n", "# Multimodal vector search - Video"]}, {"cell_type": "markdown", "id": "f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6", "metadata": {}, "source": ["<!-- TABS -->\n", "## Configure your production system"]}, {"cell_type": "markdown", "id": "81e7cd59-67d0-4776-aea1-4864aa768f95", "metadata": {}, "source": [":::note\n", "If you would like to use the production features \n", "of SuperDuperDB, then you should set the relevant \n", "connections and configurations in a configuration \n", "file. Otherwise you are welcome to use \"development\" mode \n", "to get going with SuperDuperDB quickly.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "62014646-ccd4-4d10-ac26-1c470f88f2f2", "metadata": {}, "outputs": [], "source": ["import os\n", "\n", "os.makedirs('.superduperdb', exist_ok=True)\n", "os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]}, {"cell_type": "code", "execution_count": null, "id": "8e50edd2-438d-44ab-9da0-0b72197df262", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Community>\n", "CFG = '''\n", "data_backend: mongodb://127.0.0.1:27017/documents\n", "artifact_store: filesystem://./artifact_store\n", "cluster:\n", "  cdc:\n", "    strategy: null\n", "    uri: ray://127.0.0.1:20000\n", "  compute:\n", "    uri: ray://127.0.0.1:10001\n", "  vector_search:\n", "    backfill_batch_size: 100\n", "    type: in_memory\n", "    uri: http://127.0.0.1:21000\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": 
"1ad9ee67-6402-45ea-8311-3efb039b5df3", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Atlas>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "        type: native\n", "databackend: mongodb+srv://<user>:<password>@<mongo-host>:27017/documents\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9c9e8351-b17f-4882-bda6-5ad51dbc7e1f", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: sqlite://<path-to-db>.db\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mysql://<user>:<password>@<host>:<port>/database\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9b7ac715-712c-4ec7-be90-0aaa22518977", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mssql://<user>:<password>@<host>:<port>\n", 
"'''"]}, {"cell_type": "code", "execution_count": null, "id": "f21fad9c-cc0e-4cf5-83f0-41a3a614c6af", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: postgres://<user>:<password>@<host>:<port</<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "1badb5a3-823c-4463-ab79-6f4f9239dabe", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: snowflake://<user>:<password>@<account>/<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "ae7807d9-9fc1-4c18-8027-a512f827783d", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: clickhouse://<user>:<password>@<host>:<port>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "fc40c13b-9bc5-47ac-86d6-ef7a379c45ee", "metadata": {}, "outputs": [], "source": ["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n", "    f.write(CFG)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Start your cluster"]}, {"cell_type": 
"markdown", "metadata": {}, "source": [":::note\n", "Starting a SuperDuperDB cluster is useful in production and model development\n", "if you want to enable scalable compute, access to the models by multiple users for collaboration, \n", "monitoring.\n", "\n", "If you don't need this, then it is simpler to start in development mode.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Experimental Cluster>\n", "!python -m superduperdb local-cluster up"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Docker-Compose>\n", "!make build_sandbox\n", "!make testenv_init"]}, {"cell_type": "markdown", "id": "32f8484d-2e35-472a-9b24-1a30ec1d144b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Connect to SuperDuperDB"]}, {"cell_type": "markdown", "id": "06d66021-ce62-4021-a2c5-158dee92b3bb", "metadata": {}, "source": [":::note\n", "Note that this is only relevant if you are running SuperDuperDB in development mode.\n", "Otherwise refer to \"Configuring your production system\".\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "61976f44-8139-41c0-a73e-569c6d16c4b1", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongodb://localhost:27017/documents')"]}, {"cell_type": "code", "execution_count": null, "id": "e981a457", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "from superduperdb import superduper\n", "db = superduper('sqlite://my_db.db')"]}, {"cell_type": "code", "execution_count": null, "id": "19ecf7c0-b730-4503-9b5d-e97697b3bcee", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "from superduperdb import superduper\n", "\n", "user = 'superduper'\n", "password = 'superduper'\n", "port = 3306\n", "host = 'localhost'\n", "database = 'test_db'\n", "\n", "db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]}, 
{"cell_type": "code", "execution_count": null, "id": "df208e8c-4fd0-438f-af29-22a763a2aebd", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "from superduperdb import superduper\n", "\n", "user = 'sa'\n", "password = 'Superduper#1'\n", "port = 1433\n", "host = 'localhost'\n", "\n", "db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]}, {"cell_type": "code", "execution_count": null, "id": "d2297295", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "!pip install psycopg2\n", "from superduperdb import superduper\n", "\n", "user = 'postgres'\n", "password = 'postgres'\n", "port = 5432\n", "host = 'localhost'\n", "database = 'test_db'\n", "db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n", "\n", "db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]}, {"cell_type": "code", "execution_count": null, "id": "cc6c8517", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "from superduperdb import superduper\n", "\n", "user = \"superduperuser\"\n", "password = \"superduperpassword\"\n", "account = \"XXXX-XXXX\"  # ORGANIZATIONID-USERID\n", "database = \"FREE_COMPANY_DATASET/PUBLIC\"\n", "\n", "snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n", "\n", "db = superduper(\n", "    snowflake_uri, \n", "    metadata_store='sqlite:///your_database_name.db',\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "from superduperdb import superduper\n", "\n", "user = 'default'\n", "password = ''\n", "port = 8123\n", "host = 'localhost'\n", "\n", "db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "0e89c8dd-d845-423a-9acc-97e3360d370c", "metadata": {}, "outputs": [], "source": ["# <tab: DuckDB>\n", "from superduperdb import 
superduper\n", "\n", "db = superduper('duckdb://mydb.duckdb')"]}, {"cell_type": "code", "execution_count": null, "id": "2de71562", "metadata": {}, "outputs": [], "source": ["# <tab: Pandas>\n", "from superduperdb import superduper\n", "\n", "db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "cb029a5e-fedf-4f07-8a31-d220cfbfbb3d", "metadata": {}, "outputs": [], "source": ["# <tab: MongoMock>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongomock:///test_db')"]}, {"cell_type": "markdown", "id": "032c2e7b-3f54-4263-b778-0fef60596efb", "metadata": {}, "source": ["<!-- TABS -->\n", "## Get useful sample data"]}, {"cell_type": "code", "execution_count": null, "id": "1b6f7ccb", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/videos.zip && unzip videos.zip\n", "import os\n", "\n", "data = [f'videos/{x}' for x in os.listdir('./videos')]\n", "sample_datapoint = data[-1]\n", "\n", "from superduperdb.ext.pillow import pil_image\n", "chunked_model_datatype = pil_image"]}, {"cell_type": "code", "execution_count": null, "id": "44a702b1-faf9-4edb-8a55-efc4add84a83", "metadata": {}, "outputs": [], "source": ["datas = [{'x': d} for d in data[:3]]"]}, {"cell_type": "markdown", "id": "b31257e4-06fa-4cc7-9626-bb4d03fdc029", "metadata": {}, "source": ["<!-- TABS -->\n", "## Create datatype"]}, {"cell_type": "markdown", "id": "43284218", "metadata": {}, "source": ["SuperDuperDB supports automatic data conversion, so users don\u2019t need to worry about the compatibility of different data formats (`PIL.Image`, `numpy.array`, `pandas.DataFrame`, etc.) 
with the database.\n", "\n", "It also supports custom data conversion methods for transforming data, such as defining the following Datatype."]}, {"cell_type": "code", "execution_count": null, "id": "e844c762-3391-401d-9047-ed8617a9c946", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "from superduperdb import DataType\n", "\n", "# Create a DataType with the identifier 'video_on_file' and encodable set to 'file'\n", "datatype = DataType(\n", "    identifier='video_on_file',\n", "    encodable='file',\n", ")"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Setup tables or collections"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["from superduperdb.components.table import Table\n", "from superduperdb import Schema\n", "\n", "schema = Schema(identifier=\"schema\", fields={\"x\": datatype})\n", "table_or_collection = Table(\"documents\", schema=schema)\n", "db.apply(table_or_collection)"]}, {"cell_type": "markdown", "id": "a947c52e-919e-4440-b1d6-914e690314d4", "metadata": {}, "source": ["Inserting data, all fields will be matched with the schema for data conversion."]}, {"cell_type": "code", "execution_count": null, "id": "afead32f-fc4c-4b11-9d31-d38bf061c232", "metadata": {}, "outputs": [], "source": ["db['documents'].insert(datas).execute()\n", "select = db['documents'].select()"]}, {"cell_type": "markdown", "id": "54fea927-ee4a-44cd-aaf2-634b574c316d", "metadata": {}, "source": ["<!-- TABS -->\n", "## Apply a chunker for search"]}, {"cell_type": "markdown", "id": "06d90bda-e8c4-494e-a38c-837fb63689ae", "metadata": {}, "source": [":::note\n", "Note that applying a chunker is ***not*** mandatory for search.\n", "If your data is already chunked (e.g. 
short text snippets or audio) or if you\n", "are searching through something like images, which can't be chunked, then this\n", "won't be necessary.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "f093a6d0-9d2f-4ecf-b1bd-0027302c62de", "metadata": {}, "outputs": [], "source": ["# <tab: Video>\n", "!pip install opencv-python\n", "import cv2\n", "import tqdm\n", "from PIL import Image\n", "from superduperdb.ext.pillow import pil_image\n", "from superduperdb import model, Schema\n", "\n", "\n", "@model(\n", "    flatten=True,\n", "    model_update_kwargs={'document_embedded': False},\n", ")\n", "def chunker(video_file):\n", "    # Set the sampling frequency for frames\n", "    sample_freq = 10\n", "    \n", "    # Open the video file using OpenCV\n", "    cap = cv2.VideoCapture(video_file)\n", "    \n", "    # Initialize variables\n", "    frame_count = 0\n", "    fps = cap.get(cv2.CAP_PROP_FPS)\n", "    extracted_frames = []\n", "    progress = tqdm.tqdm()\n", "\n", "    # Iterate through video frames\n", "    while True:\n", "        ret, frame = cap.read()\n", "        if not ret:\n", "            break\n", "        \n", "        # Get the current timestamp based on frame count and FPS\n", "        current_timestamp = frame_count // fps\n", "        \n", "        # Sample frames based on the specified frequency\n", "        if frame_count % sample_freq == 0:\n", "            extracted_frames.append({\n", "                'image': Image.fromarray(frame[:,:,::-1]),  # Convert BGR to RGB\n", "                'current_timestamp': current_timestamp,\n", "            })\n", "        frame_count += 1\n", "        progress.update(1)\n", "    \n", "    # Release resources\n", "    cap.release()\n", "    cv2.destroyAllWindows()\n", "    \n", "    # Return the list of extracted frames\n", "    return extracted_frames"]}, {"cell_type": "markdown", "id": "b33a16f9-3bac-45bb-80ac-3ccf265dce5f", "metadata": {}, "source": ["Now we apply this chunker to the data 
by wrapping the chunker in `Listener`:"]}, {"cell_type": "code", "execution_count": null, "id": "93d21872-d4dc-40dc-abab-fb07ba102ea3", "metadata": {}, "outputs": [], "source": ["from superduperdb import Listener\n", "\n", "upstream_listener = Listener(\n", "    model=chunker,\n", "    select=select,\n", "    key='x',\n", "    uuid=\"chunk\",\n", ")\n", "\n", "db.apply(upstream_listener)"]}, {"cell_type": "markdown", "id": "907721f8-d5bf-4623-8871-3ab9a05001d7", "metadata": {}, "source": ["## Build multimodal embedding models"]}, {"cell_type": "markdown", "id": "033e1eaf-2cdb-499a-ba83-cf080a1a6fda", "metadata": {}, "source": ["We define the output data type of a model as a vector for vector transformation."]}, {"cell_type": "code", "execution_count": null, "id": "28848ff1-45ab-4926-8676-777edf237347", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb.components.vector_index import vector\n", "output_datatpye = vector(shape=(1024,))"]}, {"cell_type": "code", "execution_count": null, "id": "6acf66c5-7369-4aa8-a8a0-5842bd17b469", "metadata": {}, "outputs": [], "source": ["# <tab: SQL>\n", "from superduperdb.components.vector_index import sqlvector\n", "output_datatpye = sqlvector(shape=(1024,))"]}, {"cell_type": "markdown", "id": "143bf946-64b7-4452-8d20-44f2f9ae3fd6", "metadata": {}, "source": ["Then define two models, one for text embedding and one for image embedding."]}, {"cell_type": "code", "execution_count": null, "id": "f33513d3-9f86-4108-8f8b-4a6251bdd9fd", "metadata": {}, "outputs": [], "source": ["# <tab: Text-Image>\n", "!pip install git+https://github.com/openai/CLIP.git\n", "import clip\n", "from superduperdb import vector\n", "from superduperdb.ext.torch import TorchModel\n", "\n", "# Load the CLIP model and obtain the preprocessing function\n", "model, preprocess = clip.load(\"ViT-B/32\", device='cpu')\n", "\n", "# Create a TorchModel for text encoding\n", "compatible_model = TorchModel(\n", "    
identifier='clip_text', # Unique identifier for the model\n", "    object=model, # CLIP model\n", "    preprocess=lambda x: clip.tokenize(x)[0],  # Model input preprocessing using CLIP \n", "    postprocess=lambda x: x.tolist(), # Convert the model output to a list\n", "    datatype=output_datatpye,  # Vector encoder with shape (1024,)\n", "    forward_method='encode_text', # Use the 'encode_text' method for forward pass \n", ")\n", "\n", "# Create a TorchModel for visual encoding\n", "model = TorchModel(\n", "    identifier='clip_image',  # Unique identifier for the model\n", "    object=model.visual,  # Visual part of the CLIP model    \n", "    preprocess=preprocess, # Visual preprocessing using CLIP\n", "    postprocess=lambda x: x.tolist(), # Convert the output to a list \n", "    datatype=output_datatpye, # Vector encoder with shape (1024,)\n", ")"]}, {"cell_type": "markdown", "id": "3d0119da-9cfd-4a60-8847-c3bfdf37697f", "metadata": {}, "source": ["Because we use multimodal models, we define different keys to specify which model to use for embedding calculations in the vector_index."]}, {"cell_type": "code", "execution_count": null, "id": "12e75fab-8504-4d17-a7d9-f98667a5d6aa", "metadata": {}, "outputs": [], "source": ["compatible_key = 'text' # we use text key for text embedding\n", "indexing_key = upstream_listener.outputs_key + '.image' # we use indexing_key for image embedding, use the image field of the result\n", "select = upstream_listener.outputs_select"]}, {"cell_type": "markdown", "id": "41b8b40d-3750-4d7b-aa60-62e07b734b04", "metadata": {}, "source": ["## Create vector-index"]}, {"cell_type": "code", "execution_count": null, "id": "66ee3ff4-880e-477b-bbdf-5b8d89c56de2", "metadata": {}, "outputs": [], "source": ["vector_index_name = 'my-vector-index'"]}, {"cell_type": "code", "execution_count": null, "id": "4cede653", "metadata": {}, "outputs": [], "source": ["# <tab: 2-Modalities>\n", "from superduperdb import VectorIndex, Listener\n", "\n", 
"jobs, _ = db.add(\n", "    VectorIndex(\n", "        vector_index_name,\n", "        indexing_listener=Listener(\n", "            key=indexing_key,      # the `Document` key `model` should ingest to create embedding\n", "            select=select,       # a `Select` query telling which data to search over\n", "            model=model,         # a `_Predictor` how to convert data to embeddings\n", "        ),\n", "        compatible_listener=Listener(\n", "            key=compatible_key,      # the `Document` key `model` should ingest to create embedding\n", "            model=compatible_model,         # a `_Predictor` how to convert data to embeddings\n", "            active=False,\n", "            select=None,\n", "        )\n", "    )\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "067a1203-8dbc-4d0a-aa4a-705d99902d52", "metadata": {}, "outputs": [], "source": ["query_table_or_collection = select.table_or_collection"]}, {"cell_type": "markdown", "id": "b8a87f9d-581a-419a-81b8-a743250413e9", "metadata": {}, "source": ["## Perform a vector search\n", "\n", "We can perform the vector searches using text description:"]}, {"cell_type": "code", "execution_count": null, "id": "ce565823-4655-488c-8684-2240107fa30d", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "from superduperdb import Document\n", "item = Document({compatible_key: \"The moment of a soccer shot\"})"]}, {"cell_type": "markdown", "id": "fc3ba07d-1124-4d94-a117-60d2e72581f7", "metadata": {}, "source": ["Once we have this search target, we can execute a search as follows."]}, {"cell_type": "code", "execution_count": null, "id": "a061de0b-2694-4b36-844c-7753a465360f", "metadata": {}, "outputs": [], "source": ["select = query_table_or_collection.like(item, vector_index=vector_index_name, n=5).select()\n", "results = list(db.execute(select))"]}, {"cell_type": "markdown", "id": "9b6d9af9-a012-42bd-aad4-31b92d089caa", "metadata": {}, "source": ["## Visualize Results"]}, 
{"cell_type": "code", "execution_count": null, "id": "9e2ecea5-3a58-457c-ac50-ddc742484f2d", "metadata": {}, "outputs": [], "source": ["from IPython.display import display\n", "for result in results:\n", "    display(Document(result.unpack())[indexing_key])"]}, {"cell_type": "markdown", "id": "693b4878-39a2-444d-8e17-72a00e6c246d", "metadata": {}, "source": ["## Check the system stays updated\n", "\n", "You can add new data; once the data is added, all related models will perform calculations according to the underlying constructed model and listener, simultaneously updating the vector index to ensure that each query uses the latest data."]}, {"cell_type": "code", "execution_count": null, "id": "5ef97f5a-bb41-46ca-a85e-489824741216", "metadata": {}, "outputs": [], "source": ["new_datas = [{'x': data[-1]}]\n", "ids = db['documents'].insert(new_datas).execute()"]}]}
\ No newline at end of file
diff --git a/docs/content/use_cases/multimodal_vector_search_video.md b/docs/content/use_cases/multimodal_vector_search_video.md
index d703f426c..675246fbd 100644
--- a/docs/content/use_cases/multimodal_vector_search_video.md
+++ b/docs/content/use_cases/multimodal_vector_search_video.md
@@ -319,9 +319,9 @@ datas = [{'x': d} for d in data[:3]]
 <!-- TABS -->
 ## Create datatype
 
-Data types such as "text" or "integer" which are natively support by your `db.databackend` don't need a datatype.
+SuperDuperDB supports automatic data conversion, so users don’t need to worry about the compatibility of different data formats (`PIL.Image`, `numpy.array`, `pandas.DataFrame`, etc.) with the database.
 
-Otherwise do one of the following:
+It also supports custom data conversion methods for transforming data, for example by defining a custom `DataType` like the one below.
 
 
 <Tabs>
diff --git a/docs/content/use_cases/retrieval_augmented_generation.ipynb b/docs/content/use_cases/retrieval_augmented_generation.ipynb
index 5582c2b0a..5656c9f7c 100644
--- a/docs/content/use_cases/retrieval_augmented_generation.ipynb
+++ b/docs/content/use_cases/retrieval_augmented_generation.ipynb
@@ -1 +1 @@
-{"metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2"}}, "nbformat": 4, "nbformat_minor": 5, "cells": [{"cell_type": "markdown", "id": "38c1a328-fd86-4c5f-bd54-b8664f433608", "metadata": {}, "source": ["<!-- TABS -->\n", "# Retrieval augmented generation"]}, {"cell_type": "markdown", "id": "f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6", "metadata": {}, "source": ["<!-- TABS -->\n", "## Configure your production system"]}, {"cell_type": "markdown", "id": "81e7cd59-67d0-4776-aea1-4864aa768f95", "metadata": {}, "source": [":::note\n", "If you would like to use the production features \n", "of SuperDuperDB, then you should set the relevant \n", "connections and configurations in a configuration \n", "file. Otherwise you are welcome to use \"development\" mode \n", "to get going with SuperDuperDB quickly.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "62014646-ccd4-4d10-ac26-1c470f88f2f2", "metadata": {}, "outputs": [], "source": ["import os\n", "\n", "os.makedirs('.superduperdb', exist_ok=True)\n", "os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]}, {"cell_type": "code", "execution_count": null, "id": "8e50edd2-438d-44ab-9da0-0b72197df262", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Community>\n", "CFG = '''\n", "data_backend: mongodb://127.0.0.1:27017/documents\n", "artifact_store: filesystem://./artifact_store\n", "cluster:\n", "  cdc:\n", "    strategy: null\n", "    uri: ray://127.0.0.1:20000\n", "  compute:\n", "    uri: ray://127.0.0.1:10001\n", "  vector_search:\n", "    backfill_batch_size: 100\n", "    type: in_memory\n", "    uri: http://127.0.0.1:21000\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": 
"1ad9ee67-6402-45ea-8311-3efb039b5df3", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Atlas>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "        type: native\n", "databackend: mongodb+srv://<user>:<password>@<mongo-host>:27017/documents\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9c9e8351-b17f-4882-bda6-5ad51dbc7e1f", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: sqlite://<path-to-db>.db\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mysql://<user>:<password>@<host>:<port>/database\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9b7ac715-712c-4ec7-be90-0aaa22518977", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mssql://<user>:<password>@<host>:<port>\n", 
"'''"]}, {"cell_type": "code", "execution_count": null, "id": "f21fad9c-cc0e-4cf5-83f0-41a3a614c6af", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: postgres://<user>:<password>@<host>:<port</<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "1badb5a3-823c-4463-ab79-6f4f9239dabe", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: snowflake://<user>:<password>@<account>/<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "ae7807d9-9fc1-4c18-8027-a512f827783d", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: clickhouse://<user>:<password>@<host>:<port>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "fc40c13b-9bc5-47ac-86d6-ef7a379c45ee", "metadata": {}, "outputs": [], "source": ["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n", "    f.write(CFG)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Start your cluster"]}, {"cell_type": 
"markdown", "metadata": {}, "source": [":::note\n", "Starting a SuperDuperDB cluster is useful in production and model development\n", "if you want to enable scalable compute, access to the models by multiple users for collaboration, \n", "monitoring.\n", "\n", "If you don't need this, then it is simpler to start in development mode.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Experimental Cluster>\n", "!python -m superduperdb local-cluster up"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Docker-Compose>\n", "!make build_sandbox\n", "!make testenv_init"]}, {"cell_type": "markdown", "id": "32f8484d-2e35-472a-9b24-1a30ec1d144b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Connect to SuperDuperDB"]}, {"cell_type": "markdown", "id": "06d66021-ce62-4021-a2c5-158dee92b3bb", "metadata": {}, "source": [":::note\n", "Note that this is only relevant if you are running SuperDuperDB in development mode.\n", "Otherwise refer to \"Configuring your production system\".\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "61976f44-8139-41c0-a73e-569c6d16c4b1", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongodb://localhost:27017/documents')"]}, {"cell_type": "code", "execution_count": null, "id": "e981a457", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "from superduperdb import superduper\n", "db = superduper('sqlite://my_db.db')"]}, {"cell_type": "code", "execution_count": null, "id": "19ecf7c0-b730-4503-9b5d-e97697b3bcee", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "from superduperdb import superduper\n", "\n", "user = 'superduper'\n", "password = 'superduper'\n", "port = 3306\n", "host = 'localhost'\n", "database = 'test_db'\n", "\n", "db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]}, 
{"cell_type": "code", "execution_count": null, "id": "df208e8c-4fd0-438f-af29-22a763a2aebd", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "from superduperdb import superduper\n", "\n", "user = 'sa'\n", "password = 'Superduper#1'\n", "port = 1433\n", "host = 'localhost'\n", "\n", "db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]}, {"cell_type": "code", "execution_count": null, "id": "d2297295", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "!pip install psycopg2\n", "from superduperdb import superduper\n", "\n", "user = 'postgres'\n", "password = 'postgres'\n", "port = 5432\n", "host = 'localhost'\n", "database = 'test_db'\n", "db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n", "\n", "db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]}, {"cell_type": "code", "execution_count": null, "id": "cc6c8517", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "from superduperdb import superduper\n", "\n", "user = \"superduperuser\"\n", "password = \"superduperpassword\"\n", "account = \"XXXX-XXXX\"  # ORGANIZATIONID-USERID\n", "database = \"FREE_COMPANY_DATASET/PUBLIC\"\n", "\n", "snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n", "\n", "db = superduper(\n", "    snowflake_uri, \n", "    metadata_store='sqlite:///your_database_name.db',\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "from superduperdb import superduper\n", "\n", "user = 'default'\n", "password = ''\n", "port = 8123\n", "host = 'localhost'\n", "\n", "db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "0e89c8dd-d845-423a-9acc-97e3360d370c", "metadata": {}, "outputs": [], "source": ["# <tab: DuckDB>\n", "from superduperdb import 
superduper\n", "\n", "db = superduper('duckdb://mydb.duckdb')"]}, {"cell_type": "code", "execution_count": null, "id": "2de71562", "metadata": {}, "outputs": [], "source": ["# <tab: Pandas>\n", "from superduperdb import superduper\n", "\n", "db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "cb029a5e-fedf-4f07-8a31-d220cfbfbb3d", "metadata": {}, "outputs": [], "source": ["# <tab: MongoMock>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongomock:///test_db')"]}, {"cell_type": "markdown", "id": "032c2e7b-3f54-4263-b778-0fef60596efb", "metadata": {}, "source": ["<!-- TABS -->\n", "## Get useful sample data"]}, {"cell_type": "code", "execution_count": null, "id": "4e7902bd", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json\n", "import json\n", "\n", "with open('text.json', 'r') as f:\n", "    data = json.load(f)"]}, {"cell_type": "code", "execution_count": null, "id": "33486ec7-0316-4e0c-a409-c09ab4c16669", "metadata": {}, "outputs": [], "source": ["# <tab: PDF>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/pdfs.zip && unzip -o pdfs.zip\n", "import os\n", "\n", "data = [f'pdfs/{x}' for x in os.listdir('./pdfs') if x.endswith('.pdf')]"]}, {"cell_type": "code", "execution_count": null, "id": "b745ed54-3818-4685-a3b5-6ab4e2afc44d", "metadata": {}, "outputs": [], "source": ["datas = [{'x': d} for d in data]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Insert simple data\n", "\n", "After turning on auto_schema, we can directly insert data, and superduperdb will automatically analyze the data type, and match the construction of the table and datatype."]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["from superduperdb import Document\n", "\n", "table_or_collection = db['documents']\n", "\n", "ids = 
db.execute(table_or_collection.insert([Document(data) for data in datas]))\n", "select = table_or_collection.select()"]}, {"cell_type": "markdown", "id": "54fea927-ee4a-44cd-aaf2-634b574c316d", "metadata": {}, "source": ["<!-- TABS -->\n", "## Apply a chunker for search"]}, {"cell_type": "markdown", "id": "06d90bda-e8c4-494e-a38c-837fb63689ae", "metadata": {}, "source": [":::note\n", "Note that applying a chunker is ***not*** mandatory for search.\n", "If your data is already chunked (e.g. short text snippets or audio) or if you\n", "are searching through something like images, which can't be chunked, then this\n", "won't be necessary.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "2d20eaa0-a416-4483-938e-23f79845739a", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "from superduperdb import model\n", "\n", "CHUNK_SIZE = 200\n", "\n", "@model(flatten=True, model_update_kwargs={'document_embedded': False})\n", "def chunker(text):\n", "    text = text.split()\n", "    chunks = [' '.join(text[i:i + CHUNK_SIZE]) for i in range(0, len(text), CHUNK_SIZE)]\n", "    return chunks"]}, {"cell_type": "code", "execution_count": null, "id": "facd7dc0-fffa-40d8-af72-2b9e4852ad79", "metadata": {}, "outputs": [], "source": ["# <tab: PDF>\n", "!pip install -q \"unstructured[pdf]\"\n", "from superduperdb import model\n", "from unstructured.partition.pdf import partition_pdf\n", "\n", "CHUNK_SIZE = 500\n", "\n", "@model(flatten=True, model_update_kwargs={'document_embedded': False})\n", "def chunker(pdf_file):\n", "    elements = partition_pdf(pdf_file)\n", "    text = '\\n'.join([e.text for e in elements])\n", "    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]\n", "    return chunks"]}, {"cell_type": "markdown", "id": "b33a16f9-3bac-45bb-80ac-3ccf265dce5f", "metadata": {}, "source": ["Now we apply this chunker to the data by wrapping the chunker in `Listener`:"]}, {"cell_type": "code", "execution_count": null, "id": 
"93d21872-d4dc-40dc-abab-fb07ba102ea3", "metadata": {}, "outputs": [], "source": ["from superduperdb import Listener\n", "\n", "upstream_listener = Listener(\n", "    model=chunker,\n", "    select=select,\n", "    key='x',\n", "    uuid=\"chunk\",\n", ")\n", "\n", "db.apply(upstream_listener)"]}, {"cell_type": "markdown", "id": "7c5377c0-4c9b-4ba9-8f08-5e866b9220b5", "metadata": {}, "source": ["## Select outputs of upstream listener"]}, {"cell_type": "markdown", "id": "809f5f62-95c3-483b-ae74-a5cdb5c1c83d", "metadata": {}, "source": [":::note\n", "This is useful if you have performed a first step, such as pre-computing \n", "features, or chunking your data. You can use this query to \n", "operate on those outputs.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "e49b116d-34f5-438a-995e-a8bd59e1dd80", "metadata": {}, "outputs": [], "source": ["indexing_key = upstream_listener.outputs_key\n", "select = upstream_listener.outputs_select"]}, {"cell_type": "markdown", "id": "c9a2cd87-723f-4cee-87c7-9b8181c9e54b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Build text embedding model"]}, {"cell_type": "code", "execution_count": null, "id": "a9b1f538-65ca-499e-b6d0-2dd733f81723", "metadata": {}, "outputs": [], "source": ["# <tab: OpenAI>\n", "!pip install openai\n", "from superduperdb.ext.openai import OpenAIEmbedding\n", "model = OpenAIEmbedding(identifier='text-embedding-ada-002')"]}, {"cell_type": "code", "execution_count": null, "id": "e83facd8-8823-492f-a2c6-659f38d8e6ec", "metadata": {}, "outputs": [], "source": ["# <tab: JinaAI>\n", "import os\n", "from superduperdb.ext.jina import JinaEmbedding\n", "\n", "os.environ[\"JINA_API_KEY\"] = \"jina_xxxx\"\n", " \n", "# define the model\n", "model = JinaEmbedding(identifier='jina-embeddings-v2-base-en')"]}, {"cell_type": "code", "execution_count": null, "id": "3b4a9a60-41df-461d-b165-1d136ee25694", "metadata": {}, "outputs": [], "source": ["# <tab: Sentence-Transformers>\n", "!pip install 
sentence-transformers\n", "from superduperdb import vector\n", "import sentence_transformers\n", "from superduperdb.ext.sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer(\n", "    identifier=\"embedding\",\n", "    object=sentence_transformers.SentenceTransformer(\"BAAI/bge-small-en\"),\n", "    datatype=vector(shape=(1024,)),\n", "    postprocess=lambda x: x.tolist(),\n", "    predict_kwargs={\"show_progress_bar\": True},\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "b1219380-13ce-4301-90e6-6ede2eee1497", "metadata": {}, "outputs": [], "source": ["# <tab: Transformers>\n", "import dataclasses as dc\n", "from superduperdb import vector\n", "from superduperdb.components.model import Model, ensure_initialized, Signature\n", "from transformers import AutoTokenizer, AutoModel\n", "import torch\n", "\n", "@dc.dataclass(kw_only=True)\n", "class TransformerEmbedding(Model):\n", "    signature: Signature = 'singleton'\n", "    pretrained_model_name_or_path : str\n", "\n", "    def init(self):\n", "        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path)\n", "        self.model = AutoModel.from_pretrained(self.pretrained_model_name_or_path)\n", "        self.model.eval()\n", "\n", "    @ensure_initialized\n", "    def predict(self, x):\n", "        return self.predict([x])[0]\n", "        \n", "    @ensure_initialized\n", "    def predict(self, dataset):\n", "        encoded_input = self.tokenizer(dataset, padding=True, truncation=True, return_tensors='pt')\n", "        # Compute token embeddings\n", "        with torch.no_grad():\n", "            model_output = self.model(**encoded_input)\n", "            # Perform pooling. 
In this case, cls pooling.\n", "            sentence_embeddings = model_output[0][:, 0]\n", "        # normalize embeddings\n", "        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)\n", "        return sentence_embeddings.tolist()\n", "\n", "\n", "model = TransformerEmbedding(identifier=\"embedding\", pretrained_model_name_or_path=\"BAAI/bge-small-en\", datatype=vector(shape=(384, )))"]}, {"cell_type": "code", "execution_count": null, "id": "b9b238cf-56d5-44b4-87b0-9d8d55bdf36f", "metadata": {}, "outputs": [], "source": ["print(len(model.predict(\"What is SuperDuperDB\")))"]}, {"cell_type": "markdown", "id": "f31843db-8638-458a-a770-96a79041be88", "metadata": {}, "source": ["## Create vector-index"]}, {"cell_type": "code", "execution_count": null, "id": "4663fa4b-c2ec-427d-bf8b-b8b109cc2ccf", "metadata": {}, "outputs": [], "source": ["from superduperdb import VectorIndex, Listener\n", "\n", "vector_index_name = 'vector-index'\n", "\n", "jobs, _ = db.add(\n", "    VectorIndex(\n", "        vector_index_name,\n", "        indexing_listener=Listener(\n", "            key=indexing_key,      # the `Document` key `model` should ingest to create embedding\n", "            select=select,       # a `Select` query telling which data to search over\n", "            model=model,         # a `_Predictor` how to convert data to embeddings\n", "            uuid=\"embedding\"\n", "        )\n", "    )\n", ")\n", "query_table_or_collection = select.table_or_collection"]}, {"cell_type": "code", "execution_count": null, "id": "053183e1-fee8-4b7b-a567-62ce97845c98", "metadata": {}, "outputs": [], "source": ["query = \"Tell me about the SuperDuperDB\""]}, {"cell_type": "markdown", "id": "91142c55-b256-4025-94c2-6c4d215c6975", "metadata": {}, "source": ["<!-- TABS -->\n", "## Create Vector Search Model"]}, {"cell_type": "code", "execution_count": null, "id": "b5b99541-fd10-41c1-b6a7-1da6c1d4dbd7", "metadata": {}, "outputs": [], "source": ["from 
superduperdb.base.variables import Variable\n", "item = {indexing_key: Variable('query')}"]}, {"cell_type": "code", "execution_count": null, "id": "d47799ab-b688-4eb8-82d4-6c0aa1204801", "metadata": {}, "outputs": [], "source": ["from superduperdb.components.model import QueryModel\n", "\n", "vector_search_model = QueryModel(\n", "    identifier=\"VectorSearch\",\n", "    select=query_table_or_collection.like(item, vector_index=vector_index_name, n=5).select(),\n", "    # The _source is the identifier of the upstream data, which can be used to locate the data from upstream sources using `_source`.\n", "    postprocess=lambda docs: [{\"text\": doc[indexing_key], \"_source\": doc[\"_source\"]} for doc in docs],\n", "    db=db\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "56c5b28e-1f9d-4e9a-8238-537b71c07d2b", "metadata": {}, "outputs": [], "source": ["vector_search_model.predict(query=query)"]}, {"cell_type": "markdown", "id": "1179a67b-4e40-496b-9851-98f32d42faa0", "metadata": {}, "source": ["<!-- TABS -->\n", "## Build LLM"]}, {"cell_type": "code", "execution_count": null, "id": "f98e5ff4", "metadata": {}, "outputs": [], "source": ["# <tab: OpenAI>\n", "!pip install openai\n", "from superduperdb.ext.openai import OpenAIChatCompletion\n", "\n", "llm = OpenAIChatCompletion(identifier='llm', model='gpt-3.5-turbo')"]}, {"cell_type": "code", "execution_count": null, "id": "9bf39c47", "metadata": {}, "outputs": [], "source": ["# <tab: Anthropic>\n", "!pip install anthropic\n", "from superduperdb.ext.anthropic import AnthropicCompletions\n", "import os\n", "\n", "os.environ[\"ANTHROPIC_API_KEY\"] = \"sk-xxx\"\n", "\n", "predict_kwargs = {\n", "    \"max_tokens\": 1024,\n", "    \"temperature\": 0.8,\n", "}\n", "\n", "llm = AnthropicCompletions(identifier='llm', model='claude-2.1', predict_kwargs=predict_kwargs)"]}, {"cell_type": "code", "execution_count": null, "id": "95e48deb", "metadata": {}, "outputs": [], "source": ["# <tab: vLLM>\n", "!pip install 
vllm\n", "from superduperdb.ext.vllm import VllmModel\n", "\n", "predict_kwargs = {\n", "    \"max_tokens\": 1024,\n", "    \"temperature\": 0.8,\n", "}\n", "\n", "\n", "llm = VllmModel(\n", "    identifier=\"llm\",\n", "    model_name=\"TheBloke/Mistral-7B-Instruct-v0.2-AWQ\",\n", "    vllm_kwargs={\n", "        \"gpu_memory_utilization\": 0.7,\n", "        \"max_model_len\": 1024,\n", "        \"quantization\": \"awq\",\n", "    },\n", "    predict_kwargs=predict_kwargs,\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "fe4ac344", "metadata": {}, "outputs": [], "source": ["# <tab: Transformers>\n", "!pip install transformers datasets bitsandbytes accelerate\n", "from superduperdb.ext.transformers import LLM\n", "\n", "llm = LLM.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\", load_in_8bit=True, device_map=\"cuda\", identifier=\"llm\", predict_kwargs=dict(max_new_tokens=128))"]}, {"cell_type": "code", "execution_count": null, "id": "1fdbfae2-af7d-4845-bce5-0cb230e3614e", "metadata": {}, "outputs": [], "source": ["# <tab: Llama.cpp>\n", "!pip install llama_cpp_python\n", "# !huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . 
--local-dir-use-symlinks False\n", "\n", "from superduperdb.ext.llamacpp.model import LlamaCpp\n", "llm = LlamaCpp(identifier=\"llm\", model_name_or_path=\"mistral-7b-instruct-v0.2.Q4_K_M.gguf\")"]}, {"cell_type": "code", "execution_count": null, "id": "7d39a98d-c2f2-4496-b50e-ff82a59d7204", "metadata": {}, "outputs": [], "source": ["# test the llm model\n", "llm.predict(\"Tell me about the SuperDuperDB\")"]}, {"cell_type": "markdown", "id": "60ae6203-dcc4-493c-a8f8-f727f0f75778", "metadata": {}, "source": ["## Answer question with LLM"]}, {"cell_type": "code", "execution_count": null, "id": "44baeb09-6f35-4cf2-b814-46283a59f7e9", "metadata": {}, "outputs": [], "source": ["from superduperdb import model\n", "from superduperdb.components.graph import Graph, input_node\n", "\n", "prompt_template = (\n", "    \"Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\\n\"\n", "    \"{context}\\n\\n\"\n", "    \"Here's the question: {query}\"\n", ")\n", "\n", "\n", "@model\n", "def build_prompt(query, docs):\n", "    chunks = [doc[\"text\"] for doc in docs]\n", "    context = \"\\n\\n\".join(chunks)\n", "    prompt = prompt_template.format(context=context, query=query)\n", "    return prompt\n", "    \n", "\n", "# We build a graph to handle the entire pipeline\n", "\n", "# create a input node, only have one input parameter `query`\n", "in_ = input_node('query')\n", "# pass the query to the vector search model\n", "vector_search_results = vector_search_model(query=in_)\n", "# pass the query and the search results to the prompt builder\n", "prompt = build_prompt(query=in_, docs=vector_search_results)\n", "# pass the prompt to the llm model\n", "answer = llm(prompt)\n", "# create a graph, and the graph output is the answer\n", "rag = answer.to_graph(\"rag\")\n", "print(rag.predict(query)[0])"]}, {"cell_type": "markdown", "id": "183bf5b6-4644-4e4c-b65b-e6bafdc6b49f", "metadata": {}, "source": ["By applying the RAG 
model to the database, it will subsequently be accessible for use in other services."]}, {"cell_type": "code", "execution_count": null, "id": "e6787c78-4b14-4a72-818b-450408a74331", "metadata": {}, "outputs": [], "source": ["db.add(rag)"]}, {"cell_type": "markdown", "id": "5da0306b-0969-49ab-95c4-0eb93c39f515", "metadata": {}, "source": ["You can now load the model elsewhere and make predictions using the following command.\n", "\n", "```python\n", "rag = db.load(\"model\", 'context_llm')\n", "print(rag.predict(\"Tell me about the SuperDuperDB\")[0])\n", "```"]}]}
\ No newline at end of file
+{"metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2"}}, "nbformat": 4, "nbformat_minor": 5, "cells": [{"cell_type": "markdown", "id": "38c1a328-fd86-4c5f-bd54-b8664f433608", "metadata": {}, "source": ["<!-- TABS -->\n", "# Retrieval augmented generation"]}, {"cell_type": "markdown", "id": "f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6", "metadata": {}, "source": ["<!-- TABS -->\n", "## Configure your production system"]}, {"cell_type": "markdown", "id": "81e7cd59-67d0-4776-aea1-4864aa768f95", "metadata": {}, "source": [":::note\n", "If you would like to use the production features \n", "of SuperDuperDB, then you should set the relevant \n", "connections and configurations in a configuration \n", "file. Otherwise you are welcome to use \"development\" mode \n", "to get going with SuperDuperDB quickly.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "62014646-ccd4-4d10-ac26-1c470f88f2f2", "metadata": {}, "outputs": [], "source": ["import os\n", "\n", "os.makedirs('.superduperdb', exist_ok=True)\n", "os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]}, {"cell_type": "code", "execution_count": null, "id": "8e50edd2-438d-44ab-9da0-0b72197df262", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Community>\n", "CFG = '''\n", "data_backend: mongodb://127.0.0.1:27017/documents\n", "artifact_store: filesystem://./artifact_store\n", "cluster:\n", "  cdc:\n", "    strategy: null\n", "    uri: ray://127.0.0.1:20000\n", "  compute:\n", "    uri: ray://127.0.0.1:10001\n", "  vector_search:\n", "    backfill_batch_size: 100\n", "    type: in_memory\n", "    uri: http://127.0.0.1:21000\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": 
"1ad9ee67-6402-45ea-8311-3efb039b5df3", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB Atlas>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "        type: native\n", "databackend: mongodb+srv://<user>:<password>@<mongo-host>:27017/documents\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9c9e8351-b17f-4882-bda6-5ad51dbc7e1f", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: sqlite://<path-to-db>.db\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mysql://<user>:<password>@<host>:<port>/database\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "9b7ac715-712c-4ec7-be90-0aaa22518977", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: mssql://<user>:<password>@<host>:<port>\n", 
"'''"]}, {"cell_type": "code", "execution_count": null, "id": "f21fad9c-cc0e-4cf5-83f0-41a3a614c6af", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: postgres://<user>:<password>@<host>:<port</<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "1badb5a3-823c-4463-ab79-6f4f9239dabe", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: snowflake://<user>:<password>@<account>/<database>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "ae7807d9-9fc1-4c18-8027-a512f827783d", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "CFG = '''\n", "artifact_store: filesystem://<path-to-artifact-store>\n", "metadata_store: sqlite://<path-to-sqlite-db>.db\n", "cluster: \n", "    compute: ray://<ray-host>\n", "    cdc:    \n", "        uri: http://<cdc-host>:<cdc-port>\n", "    vector_search:\n", "        uri: http://<vector-search-host>:<vector-search-port>\n", "databackend: clickhouse://<user>:<password>@<host>:<port>\n", "'''"]}, {"cell_type": "code", "execution_count": null, "id": "fc40c13b-9bc5-47ac-86d6-ef7a379c45ee", "metadata": {}, "outputs": [], "source": ["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n", "    f.write(CFG)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Start your cluster"]}, {"cell_type": 
"markdown", "metadata": {}, "source": [":::note\n", "Starting a SuperDuperDB cluster is useful in production and model development\n", "if you want to enable scalable compute, access to the models by multiple users for collaboration, \n", "monitoring.\n", "\n", "If you don't need this, then it is simpler to start in development mode.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Experimental Cluster>\n", "!python -m superduperdb local-cluster up"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["# <tab: Docker-Compose>\n", "!make build_sandbox\n", "!make testenv_init"]}, {"cell_type": "markdown", "id": "32f8484d-2e35-472a-9b24-1a30ec1d144b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Connect to SuperDuperDB"]}, {"cell_type": "markdown", "id": "06d66021-ce62-4021-a2c5-158dee92b3bb", "metadata": {}, "source": [":::note\n", "Note that this is only relevant if you are running SuperDuperDB in development mode.\n", "Otherwise refer to \"Configuring your production system\".\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "61976f44-8139-41c0-a73e-569c6d16c4b1", "metadata": {}, "outputs": [], "source": ["# <tab: MongoDB>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongodb://localhost:27017/documents')"]}, {"cell_type": "code", "execution_count": null, "id": "e981a457", "metadata": {}, "outputs": [], "source": ["# <tab: SQLite>\n", "from superduperdb import superduper\n", "db = superduper('sqlite://my_db.db')"]}, {"cell_type": "code", "execution_count": null, "id": "19ecf7c0-b730-4503-9b5d-e97697b3bcee", "metadata": {}, "outputs": [], "source": ["# <tab: MySQL>\n", "from superduperdb import superduper\n", "\n", "user = 'superduper'\n", "password = 'superduper'\n", "port = 3306\n", "host = 'localhost'\n", "database = 'test_db'\n", "\n", "db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]}, 
{"cell_type": "code", "execution_count": null, "id": "df208e8c-4fd0-438f-af29-22a763a2aebd", "metadata": {}, "outputs": [], "source": ["# <tab: Oracle>\n", "from superduperdb import superduper\n", "\n", "user = 'sa'\n", "password = 'Superduper#1'\n", "port = 1433\n", "host = 'localhost'\n", "\n", "db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]}, {"cell_type": "code", "execution_count": null, "id": "d2297295", "metadata": {}, "outputs": [], "source": ["# <tab: PostgreSQL>\n", "!pip install psycopg2\n", "from superduperdb import superduper\n", "\n", "user = 'postgres'\n", "password = 'postgres'\n", "port = 5432\n", "host = 'localhost'\n", "database = 'test_db'\n", "db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n", "\n", "db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]}, {"cell_type": "code", "execution_count": null, "id": "cc6c8517", "metadata": {}, "outputs": [], "source": ["# <tab: Snowflake>\n", "from superduperdb import superduper\n", "\n", "user = \"superduperuser\"\n", "password = \"superduperpassword\"\n", "account = \"XXXX-XXXX\"  # ORGANIZATIONID-USERID\n", "database = \"FREE_COMPANY_DATASET/PUBLIC\"\n", "\n", "snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n", "\n", "db = superduper(\n", "    snowflake_uri, \n", "    metadata_store='sqlite:///your_database_name.db',\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44", "metadata": {}, "outputs": [], "source": ["# <tab: Clickhouse>\n", "from superduperdb import superduper\n", "\n", "user = 'default'\n", "password = ''\n", "port = 8123\n", "host = 'localhost'\n", "\n", "db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "0e89c8dd-d845-423a-9acc-97e3360d370c", "metadata": {}, "outputs": [], "source": ["# <tab: DuckDB>\n", "from superduperdb import 
superduper\n", "\n", "db = superduper('duckdb://mydb.duckdb')"]}, {"cell_type": "code", "execution_count": null, "id": "2de71562", "metadata": {}, "outputs": [], "source": ["# <tab: Pandas>\n", "from superduperdb import superduper\n", "\n", "db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]}, {"cell_type": "code", "execution_count": null, "id": "cb029a5e-fedf-4f07-8a31-d220cfbfbb3d", "metadata": {}, "outputs": [], "source": ["# <tab: MongoMock>\n", "from superduperdb import superduper\n", "\n", "db = superduper('mongomock:///test_db')"]}, {"cell_type": "markdown", "id": "032c2e7b-3f54-4263-b778-0fef60596efb", "metadata": {}, "source": ["<!-- TABS -->\n", "## Get useful sample data"]}, {"cell_type": "code", "execution_count": null, "id": "4e7902bd", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text.json\n", "import json\n", "\n", "with open('text.json', 'r') as f:\n", "    data = json.load(f)"]}, {"cell_type": "code", "execution_count": null, "id": "33486ec7-0316-4e0c-a409-c09ab4c16669", "metadata": {}, "outputs": [], "source": ["# <tab: PDF>\n", "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/pdfs.zip && unzip -o pdfs.zip\n", "import os\n", "\n", "data = [f'pdfs/{x}' for x in os.listdir('./pdfs') if x.endswith('.pdf')]"]}, {"cell_type": "code", "execution_count": null, "id": "b745ed54-3818-4685-a3b5-6ab4e2afc44d", "metadata": {}, "outputs": [], "source": ["datas = [{'x': d} for d in data]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["<!-- TABS -->\n", "## Insert simple data\n", "\n", "After turning on auto_schema, we can directly insert data, and superduperdb will automatically analyze the data type, and match the construction of the table and datatype."]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["from superduperdb import Document\n", "\n", "table_or_collection = db['documents']\n", "\n", "ids = 
db.execute(table_or_collection.insert([Document(data) for data in datas]))\n", "select = table_or_collection.select()"]}, {"cell_type": "markdown", "id": "54fea927-ee4a-44cd-aaf2-634b574c316d", "metadata": {}, "source": ["<!-- TABS -->\n", "## Apply a chunker for search"]}, {"cell_type": "markdown", "id": "06d90bda-e8c4-494e-a38c-837fb63689ae", "metadata": {}, "source": [":::note\n", "Note that applying a chunker is ***not*** mandatory for search.\n", "If your data is already chunked (e.g. short text snippets or audio) or if you\n", "are searching through something like images, which can't be chunked, then this\n", "won't be necessary.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "2d20eaa0-a416-4483-938e-23f79845739a", "metadata": {}, "outputs": [], "source": ["# <tab: Text>\n", "from superduperdb import model\n", "\n", "CHUNK_SIZE = 200\n", "\n", "@model(flatten=True, model_update_kwargs={'document_embedded': False})\n", "def chunker(text):\n", "    text = text.split()\n", "    chunks = [' '.join(text[i:i + CHUNK_SIZE]) for i in range(0, len(text), CHUNK_SIZE)]\n", "    return chunks"]}, {"cell_type": "code", "execution_count": null, "id": "facd7dc0-fffa-40d8-af72-2b9e4852ad79", "metadata": {}, "outputs": [], "source": ["# <tab: PDF>\n", "!pip install -q \"unstructured[pdf]\"\n", "from superduperdb import model\n", "from unstructured.partition.pdf import partition_pdf\n", "\n", "CHUNK_SIZE = 500\n", "\n", "@model(flatten=True, model_update_kwargs={'document_embedded': False})\n", "def chunker(pdf_file):\n", "    elements = partition_pdf(pdf_file)\n", "    text = '\\n'.join([e.text for e in elements])\n", "    chunks = [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]\n", "    return chunks"]}, {"cell_type": "markdown", "id": "b33a16f9-3bac-45bb-80ac-3ccf265dce5f", "metadata": {}, "source": ["Now we apply this chunker to the data by wrapping the chunker in `Listener`:"]}, {"cell_type": "code", "execution_count": null, "id": 
"93d21872-d4dc-40dc-abab-fb07ba102ea3", "metadata": {}, "outputs": [], "source": ["from superduperdb import Listener\n", "\n", "upstream_listener = Listener(\n", "    model=chunker,\n", "    select=select,\n", "    key='x',\n", "    uuid=\"chunk\",\n", ")\n", "\n", "db.apply(upstream_listener)"]}, {"cell_type": "markdown", "id": "7c5377c0-4c9b-4ba9-8f08-5e866b9220b5", "metadata": {}, "source": ["## Select outputs of upstream listener"]}, {"cell_type": "markdown", "id": "809f5f62-95c3-483b-ae74-a5cdb5c1c83d", "metadata": {}, "source": [":::note\n", "This is useful if you have performed a first step, such as pre-computing \n", "features, or chunking your data. You can use this query to \n", "operate on those outputs.\n", ":::"]}, {"cell_type": "code", "execution_count": null, "id": "e49b116d-34f5-438a-995e-a8bd59e1dd80", "metadata": {}, "outputs": [], "source": ["indexing_key = upstream_listener.outputs_key\n", "select = upstream_listener.outputs_select"]}, {"cell_type": "markdown", "id": "c9a2cd87-723f-4cee-87c7-9b8181c9e54b", "metadata": {}, "source": ["<!-- TABS -->\n", "## Build text embedding model"]}, {"cell_type": "code", "execution_count": null, "id": "a9b1f538-65ca-499e-b6d0-2dd733f81723", "metadata": {}, "outputs": [], "source": ["# <tab: OpenAI>\n", "!pip install openai\n", "from superduperdb.ext.openai import OpenAIEmbedding\n", "model = OpenAIEmbedding(identifier='text-embedding-ada-002')"]}, {"cell_type": "code", "execution_count": null, "id": "e83facd8-8823-492f-a2c6-659f38d8e6ec", "metadata": {}, "outputs": [], "source": ["# <tab: JinaAI>\n", "import os\n", "from superduperdb.ext.jina import JinaEmbedding\n", "\n", "os.environ[\"JINA_API_KEY\"] = \"jina_xxxx\"\n", " \n", "# define the model\n", "model = JinaEmbedding(identifier='jina-embeddings-v2-base-en')"]}, {"cell_type": "code", "execution_count": null, "id": "3b4a9a60-41df-461d-b165-1d136ee25694", "metadata": {}, "outputs": [], "source": ["# <tab: Sentence-Transformers>\n", "!pip install 
sentence-transformers\n", "from superduperdb import vector\n", "import sentence_transformers\n", "from superduperdb.ext.sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer(\n", "    identifier=\"embedding\",\n", "    object=sentence_transformers.SentenceTransformer(\"BAAI/bge-small-en\"),\n", "    datatype=vector(shape=(384,)),\n", "    postprocess=lambda x: x.tolist(),\n", "    predict_kwargs={\"show_progress_bar\": True},\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "b1219380-13ce-4301-90e6-6ede2eee1497", "metadata": {}, "outputs": [], "source": ["# <tab: Transformers>\n", "import dataclasses as dc\n", "from superduperdb import vector\n", "from superduperdb.components.model import Model, ensure_initialized, Signature\n", "from transformers import AutoTokenizer, AutoModel\n", "import torch\n", "\n", "@dc.dataclass(kw_only=True)\n", "class TransformerEmbedding(Model):\n", "    signature: Signature = 'singleton'\n", "    pretrained_model_name_or_path : str\n", "\n", "    def init(self):\n", "        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path)\n", "        self.model = AutoModel.from_pretrained(self.pretrained_model_name_or_path)\n", "        self.model.eval()\n", "\n", "    @ensure_initialized\n", "    def predict(self, x):\n", "        # Embed a single input by delegating to the batched implementation.\n", "        return self.predict_batches([x])[0]\n", "        \n", "    @ensure_initialized\n", "    def predict_batches(self, dataset):\n", "        # Embed a batch of inputs; returns one L2-normalized CLS vector per input.\n", "        encoded_input = self.tokenizer(dataset, padding=True, truncation=True, return_tensors='pt')\n", "        # Compute token embeddings\n", "        with torch.no_grad():\n", "            model_output = self.model(**encoded_input)\n", "            # Perform pooling. 
In this case, cls pooling.\n", "            sentence_embeddings = model_output[0][:, 0]\n", "        # normalize embeddings\n", "        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)\n", "        return sentence_embeddings.tolist()\n", "\n", "\n", "model = TransformerEmbedding(identifier=\"embedding\", pretrained_model_name_or_path=\"BAAI/bge-small-en\", datatype=vector(shape=(384, )))"]}, {"cell_type": "code", "execution_count": null, "id": "b9b238cf-56d5-44b4-87b0-9d8d55bdf36f", "metadata": {}, "outputs": [], "source": ["print(len(model.predict(\"What is SuperDuperDB\")))"]}, {"cell_type": "markdown", "id": "f31843db-8638-458a-a770-96a79041be88", "metadata": {}, "source": ["## Create vector-index"]}, {"cell_type": "code", "execution_count": null, "id": "4663fa4b-c2ec-427d-bf8b-b8b109cc2ccf", "metadata": {}, "outputs": [], "source": ["from superduperdb import VectorIndex, Listener\n", "\n", "vector_index_name = 'vector-index'\n", "\n", "jobs, _ = db.add(\n", "    VectorIndex(\n", "        vector_index_name,\n", "        indexing_listener=Listener(\n", "            key=indexing_key,      # the `Document` key `model` should ingest to create embedding\n", "            select=select,       # a `Select` query telling which data to search over\n", "            model=model,         # a `_Predictor` how to convert data to embeddings\n", "            uuid=\"embedding\"\n", "        )\n", "    )\n", ")\n", "query_table_or_collection = select.table_or_collection"]}, {"cell_type": "code", "execution_count": null, "id": "053183e1-fee8-4b7b-a567-62ce97845c98", "metadata": {}, "outputs": [], "source": ["query = \"Tell me about the SuperDuperDB\""]}, {"cell_type": "markdown", "id": "91142c55-b256-4025-94c2-6c4d215c6975", "metadata": {}, "source": ["<!-- TABS -->\n", "## Create Vector Search Model"]}, {"cell_type": "code", "execution_count": null, "id": "b5b99541-fd10-41c1-b6a7-1da6c1d4dbd7", "metadata": {}, "outputs": [], "source": ["item = 
{indexing_key: '<var:query>'}"]}, {"cell_type": "code", "execution_count": null, "id": "d47799ab-b688-4eb8-82d4-6c0aa1204801", "metadata": {}, "outputs": [], "source": ["from superduperdb.components.model import QueryModel\n", "\n", "vector_search_model = QueryModel(\n", "    identifier=\"VectorSearch\",\n", "    select=query_table_or_collection.like(item, vector_index=vector_index_name, n=5).select(),\n", "    # The _source is the identifier of the upstream data, which can be used to locate the data from upstream sources using `_source`.\n", "    postprocess=lambda docs: [{\"text\": doc[indexing_key], \"_source\": doc[\"_source\"]} for doc in docs],\n", "    db=db\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "56c5b28e-1f9d-4e9a-8238-537b71c07d2b", "metadata": {}, "outputs": [], "source": ["vector_search_model.predict(query=query)"]}, {"cell_type": "markdown", "id": "1179a67b-4e40-496b-9851-98f32d42faa0", "metadata": {}, "source": ["<!-- TABS -->\n", "## Build LLM"]}, {"cell_type": "code", "execution_count": null, "id": "f98e5ff4", "metadata": {}, "outputs": [], "source": ["# <tab: OpenAI>\n", "!pip install openai\n", "from superduperdb.ext.openai import OpenAIChatCompletion\n", "\n", "llm = OpenAIChatCompletion(identifier='llm', model='gpt-3.5-turbo')"]}, {"cell_type": "code", "execution_count": null, "id": "9bf39c47", "metadata": {}, "outputs": [], "source": ["# <tab: Anthropic>\n", "!pip install anthropic\n", "from superduperdb.ext.anthropic import AnthropicCompletions\n", "import os\n", "\n", "os.environ[\"ANTHROPIC_API_KEY\"] = \"sk-xxx\"\n", "\n", "predict_kwargs = {\n", "    \"max_tokens\": 1024,\n", "    \"temperature\": 0.8,\n", "}\n", "\n", "llm = AnthropicCompletions(identifier='llm', model='claude-2.1', predict_kwargs=predict_kwargs)"]}, {"cell_type": "code", "execution_count": null, "id": "95e48deb", "metadata": {}, "outputs": [], "source": ["# <tab: vLLM>\n", "!pip install vllm\n", "from superduperdb.ext.vllm import VllmModel\n", 
"\n", "predict_kwargs = {\n", "    \"max_tokens\": 1024,\n", "    \"temperature\": 0.8,\n", "}\n", "\n", "\n", "llm = VllmModel(\n", "    identifier=\"llm\",\n", "    model_name=\"TheBloke/Mistral-7B-Instruct-v0.2-AWQ\",\n", "    vllm_kwargs={\n", "        \"gpu_memory_utilization\": 0.7,\n", "        \"max_model_len\": 1024,\n", "        \"quantization\": \"awq\",\n", "    },\n", "    predict_kwargs=predict_kwargs,\n", ")"]}, {"cell_type": "code", "execution_count": null, "id": "fe4ac344", "metadata": {}, "outputs": [], "source": ["# <tab: Transformers>\n", "!pip install transformers datasets bitsandbytes accelerate\n", "from superduperdb.ext.transformers import LLM\n", "\n", "llm = LLM.from_pretrained(\"mistralai/Mistral-7B-Instruct-v0.2\", load_in_8bit=True, device_map=\"cuda\", identifier=\"llm\", predict_kwargs=dict(max_new_tokens=128))"]}, {"cell_type": "code", "execution_count": null, "id": "1fdbfae2-af7d-4845-bce5-0cb230e3614e", "metadata": {}, "outputs": [], "source": ["# <tab: Llama.cpp>\n", "!pip install llama_cpp_python\n", "# !huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf --local-dir . 
--local-dir-use-symlinks False\n", "\n", "from superduperdb.ext.llamacpp.model import LlamaCpp\n", "llm = LlamaCpp(identifier=\"llm\", model_name_or_path=\"mistral-7b-instruct-v0.2.Q4_K_M.gguf\")"]}, {"cell_type": "code", "execution_count": null, "id": "7d39a98d-c2f2-4496-b50e-ff82a59d7204", "metadata": {}, "outputs": [], "source": ["# test the llm model\n", "llm.predict(\"Tell me about the SuperDuperDB\")"]}, {"cell_type": "markdown", "id": "60ae6203-dcc4-493c-a8f8-f727f0f75778", "metadata": {}, "source": ["## Answer question with LLM"]}, {"cell_type": "code", "execution_count": null, "id": "44baeb09-6f35-4cf2-b814-46283a59f7e9", "metadata": {}, "outputs": [], "source": ["from superduperdb import model\n", "from superduperdb.components.graph import Graph, input_node\n", "\n", "prompt_template = (\n", "    \"Use the following context snippets, these snippets are not ordered!, Answer the question based on this context.\\n\"\n", "    \"{context}\\n\\n\"\n", "    \"Here's the question: {query}\"\n", ")\n", "\n", "\n", "@model\n", "def build_prompt(query, docs):\n", "    chunks = [doc[\"text\"] for doc in docs]\n", "    context = \"\\n\\n\".join(chunks)\n", "    prompt = prompt_template.format(context=context, query=query)\n", "    return prompt\n", "    \n", "\n", "# We build a graph to handle the entire pipeline\n", "\n", "# create a input node, only have one input parameter `query`\n", "in_ = input_node('query')\n", "# pass the query to the vector search model\n", "vector_search_results = vector_search_model(query=in_)\n", "# pass the query and the search results to the prompt builder\n", "prompt = build_prompt(query=in_, docs=vector_search_results)\n", "# pass the prompt to the llm model\n", "answer = llm(prompt)\n", "# create a graph, and the graph output is the answer\n", "rag = answer.to_graph(\"rag\")\n", "print(rag.predict(query)[0])"]}, {"cell_type": "markdown", "id": "183bf5b6-4644-4e4c-b65b-e6bafdc6b49f", "metadata": {}, "source": ["By applying the RAG 
model to the database, it will subsequently be accessible for use in other services."]}, {"cell_type": "code", "execution_count": null, "id": "e6787c78-4b14-4a72-818b-450408a74331", "metadata": {}, "outputs": [], "source": ["db.add(rag)"]}, {"cell_type": "markdown", "id": "5da0306b-0969-49ab-95c4-0eb93c39f515", "metadata": {}, "source": ["You can now load the model elsewhere and make predictions using the following command.\n", "\n", "```python\n", "rag = db.load(\"model\", 'rag')\n", "print(rag.predict(\"Tell me about the SuperDuperDB\")[0])\n", "```"]}]}
\ No newline at end of file
diff --git a/docs/content/use_cases/retrieval_augmented_generation.md b/docs/content/use_cases/retrieval_augmented_generation.md
index a5f9e1848..a4e19112e 100644
--- a/docs/content/use_cases/retrieval_augmented_generation.md
+++ b/docs/content/use_cases/retrieval_augmented_generation.md
@@ -516,8 +516,7 @@ query = "Tell me about the SuperDuperDB"
 ## Create Vector Search Model
 
 ```python
-from superduperdb.base.variables import Variable
-item = {indexing_key: Variable('query')}
+item = {indexing_key: '<var:query>'}
 ```
 
 ```python