From 38e1fceadfdf615051ced217973ea9cd1a6bba6f Mon Sep 17 00:00:00 2001 From: John Pruitt Date: Mon, 8 Apr 2024 15:03:26 -0500 Subject: [PATCH] Metadata values containing jsonb arrays are now supported. Predicates can take a list as a value. A comparison operator of "@>" now tests for array containment. This is an indexed operation. We allow for list/tuple PredicateValues with elements of mixed type. Fixes to upgrade to latest openai --- README.md | 49 +++--- nbs/00_vector.ipynb | 145 ++++++++++-------- nbs/01_pgvectorizer.ipynb | 18 +-- nbs/index.ipynb | 54 +++---- nbs/requirements.txt | 1 + nbs/tsv_python_getting_started_tutorial.ipynb | 94 ++++++------ timescale_vector/client.py | 40 +++-- 7 files changed, 206 insertions(+), 195 deletions(-) diff --git a/README.md b/README.md index 7a580a4..63ed704 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Timescale Vector + PostgreSQL++ for AI Applications. @@ -109,12 +110,12 @@ Now, you can query for similar items: vec.search([1.0, 9.0]) ``` - [[UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('73d05d6e-84c1-11ee-98da-6ee10b77fd08'), + [UUID('45ecb350-0f15-11ef-8d89-e666703872d0'), {'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -130,7 +131,7 @@ constrained by a metadata filter. vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) ``` - [[UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -154,7 +155,7 @@ records = vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) (records[0]["id"],records[0]["metadata"], records[0]["contents"], records[0]["embedding"], records[0]["distance"]) ``` - (UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'), + (UUID('45ecb666-0f15-11ef-8d89-e666703872d0'), {'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -217,12 +218,12 @@ The basic query looks like: vec.search([1.0, 9.0]) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'), + [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -234,7 +235,7 @@ You could provide a limit for the number of items returned: vec.search([1.0, 9.0], limit=1) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -259,7 +260,7 @@ unconstrained): vec.search([1.0, 9.0], limit=1, filter={"action": "sit"}) ``` - [[UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -272,12 +273,12 @@ returned if it matches any dict: vec.search([1.0, 9.0], limit=2, filter=[{"action": "jump"}, {"animal": "fox"}]) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'), + [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -292,7 +293,7 @@ could use greater than and less than conditions on numeric values. vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("times", ">", 1)) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -316,7 +317,7 @@ use the right type. Supported Python types are: `str`, `int`, and vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump")) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -330,7 +331,7 @@ combining using OR semantic). So you can do: vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", ">", 1)) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -353,7 +354,7 @@ my_predicates = client.Predicates("action", "==", "jump") & (client.Predicates(" vec.search([1.0, 9.0], limit=2, predicates=my_predicates) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -367,7 +368,7 @@ semantics. You can pass in multiple 3-tuples to vec.search([1.0, 9.0], limit=2, predicates=client.Predicates(("action", "==", "jump"), ("times", ">", 10))) ``` - [[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'), + [[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -399,7 +400,7 @@ Then, you can filter using the timestamps by specifing a tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime, specific_datetime+timedelta(days=1))) ``` - [[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'), + [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -415,12 +416,12 @@ unconstrained. tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime)) ``` - [[UUID('ac8be800-0de6-11e9-889a-5eec84ba8a7b'), + [[UUID('0e505000-0def-11e9-8732-a154fea6fb50'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), 0.00016793422934946456], - [UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'), + [UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -437,7 +438,7 @@ One example: tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime, start_inclusive=False)) ``` - [[UUID('ac8be800-0de6-11e9-889a-5eec84ba8a7b'), + [[UUID('0e505000-0def-11e9-8732-a154fea6fb50'), {'times': 100, 'action': 'jump', 'animal': 'fox'}, 'jumped over the', array([ 1. , 10.8], dtype=float32), @@ -459,7 +460,7 @@ filters and `__uuid_timestamp` for predicates. Some examples below: tpvec.search([1.0, 9.0], limit=4, filter={ "__start_date": specific_datetime, "__end_date": specific_datetime+timedelta(days=1)}) ``` - [[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'), + [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -470,7 +471,7 @@ tpvec.search([1.0, 9.0], limit=4, predicates=client.Predicates("__uuid_timestamp", ">", specific_datetime) & client.Predicates("__uuid_timestamp", "<", specific_datetime+timedelta(days=1))) ``` - [[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'), + [[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'), {'times': 1, 'action': 'sit', 'animal': 'fox'}, 'the brown fox', array([1. , 1.3], dtype=float32), @@ -839,7 +840,7 @@ import psycopg2 from langchain.docstore.document import Document from langchain.text_splitter import CharacterTextSplitter from timescale_vector import client, pgvectorizer -from langchain.embeddings.openai import OpenAIEmbeddings +from langchain_openai import OpenAIEmbeddings from langchain.vectorstores.timescalevector import TimescaleVector from datetime import timedelta ``` @@ -952,8 +953,8 @@ res = vector_store.similarity_search_with_score("Blogs about cats") res ``` - [(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-855a-06302dbc8ce7', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}), - 0.12595687795193833)] + [(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-979c-e8748f6439f2', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}), + 0.12657619616729976)] ## Development diff --git a/nbs/00_vector.ipynb b/nbs/00_vector.ipynb index 73d4207..c9b7bb1 100644 --- a/nbs/00_vector.ipynb +++ b/nbs/00_vector.ipynb @@ -447,9 +447,10 @@ " \"<=\": \"<=\",\n", " \"<\": \"<\",\n", " \"!=\": \"<>\",\n", + " \"@>\": \"@>\", # array contains\n", " }\n", "\n", - " PredicateValue = Union[str, int, float, datetime]\n", + " PredicateValue = Union[str, int, float, datetime, list, tuple]\n", "\n", " def __init__(self, *clauses: Union['Predicates', Tuple[str, PredicateValue], Tuple[str, str, PredicateValue], str, PredicateValue], operator: str = 'AND'):\n", " \"\"\"\n", @@ -550,18 +551,24 @@ " else:\n", " where_conditions.append(f\"uuid_timestamp(id) {operator} {param_name}\")\n", " params.append(value)\n", - " continue\n", - "\n", - " field_cast = ''\n", - " if isinstance(value, int):\n", - " field_cast = '::int'\n", - " elif isinstance(value, float):\n", - " field_cast = '::numeric'\n", - " elif isinstance(value, datetime):\n", - " field_cast = '::timestamptz' \n", - "\n", - " where_conditions.append(f\"(metadata->>'{field}'){field_cast} {operator} {param_name}\")\n", - " params.append(value) \n", + " \n", + " elif operator == \"@>\" and (isinstance(value, list) or isinstance(value, tuple)):\n", + " if len(value) == 0:\n", + " raise ValueError(\"Invalid value. Empty lists and empty tuples are not supported.\")\n", + " json_value = json.dumps(value)\n", + " where_conditions.append(f\"metadata @> jsonb_build_object('{field}', {param_name}::jsonb)\")\n", + " params.append(json_value)\n", + " \n", + " else:\n", + " field_cast = ''\n", + " if isinstance(value, int):\n", + " field_cast = '::int'\n", + " elif isinstance(value, float):\n", + " field_cast = '::numeric'\n", + " elif isinstance(value, datetime):\n", + " field_cast = '::timestamptz'\n", + " where_conditions.append(f\"(metadata->>'{field}'){field_cast} {operator} {param_name}\")\n", + " params.append(value) \n", "\n", " if self.operator == 'NOT':\n", " or_clauses = (\" OR \").join(where_conditions)\n", @@ -896,24 +903,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L475){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L546){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### QueryBuilder.get_create_query\n", "\n", "> QueryBuilder.get_create_query ()\n", "\n", - "Generates a query to create the tables, indexes, and extensions needed to store the vector data." + "*Generates a query to create the tables, indexes, and extensions needed to store the vector data.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L475){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L546){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### QueryBuilder.get_create_query\n", "\n", "> QueryBuilder.get_create_query ()\n", "\n", - "Generates a query to create the tables, indexes, and extensions needed to store the vector data." + "*Generates a query to create the tables, indexes, and extensions needed to store the vector data.*" ] }, "execution_count": null, @@ -1152,7 +1159,7 @@ " The index to create.\n", "\n", " Returns\n", - " --------\n", + " -------\n", " None\n", " \"\"\"\n", " #todo: can we make geting the records lazy?\n", @@ -1183,9 +1190,12 @@ " A filter for metadata. Should be specified as a key-value object or a list of key-value objects (where any objects in the list are matched).\n", " predicates\n", " A Predicates object to filter the results. Predicates support more complex queries than the filter parameter. Predicates can be combined using logical operators (&, |, and ~).\n", + " uuid_time_filter\n", + " A UUIDTimeRange object to filter the results by time using the id column.\n", + " query_params\n", "\n", " Returns\n", - " --------\n", + " -------\n", " List: List of similar records.\n", " \"\"\"\n", " (query, params) = self.builder.search_query(\n", @@ -1213,24 +1223,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L843){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L843){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ] }, "execution_count": null, @@ -1252,24 +1262,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L843){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L843){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L884){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.create_tables\n", "\n", "> Async.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ] }, "execution_count": null, @@ -1286,24 +1296,12 @@ "execution_count": null, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/cevian/.pyenv/versions/3.11.4/envs/nbdev_env/lib/python3.11/site-packages/fastcore/docscrape.py:225: UserWarning: potentially wrong underline length... \n", - "Returns \n", - "-------- in \n", - "Retrieves similar records using a similarity query.\n", - "...\n", - " else: warn(msg)\n" - ] - }, { "data": { "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L944){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L985){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.search\n", "\n", @@ -1313,7 +1311,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "Retrieves similar records using a similarity query.\n", + "*Retrieves similar records using a similarity query.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -1321,14 +1319,14 @@ "| limit | int | 10 | The number of nearest neighbors to retrieve. |\n", "| filter | Union | None | A filter for metadata. Should be specified as a key-value object or a list of key-value objects (where any objects in the list are matched). |\n", "| predicates | Optional | None | A Predicates object to filter the results. Predicates support more complex queries than the filter parameter. Predicates can be combined using logical operators (&, \\|, and ~). |\n", - "| uuid_time_filter | Optional | None | |\n", + "| uuid_time_filter | Optional | None | A UUIDTimeRange object to filter the results by time using the id column. |\n", "| query_params | Optional | None | |\n", "| **Returns** | **List: List of similar records.** | | |" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L944){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L985){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Async.search\n", "\n", @@ -1338,7 +1336,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "Retrieves similar records using a similarity query.\n", + "*Retrieves similar records using a similarity query.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -1346,7 +1344,7 @@ "| limit | int | 10 | The number of nearest neighbors to retrieve. |\n", "| filter | Union | None | A filter for metadata. Should be specified as a key-value object or a list of key-value objects (where any objects in the list are matched). |\n", "| predicates | Optional | None | A Predicates object to filter the results. Predicates support more complex queries than the filter parameter. Predicates can be combined using logical operators (&, \\|, and ~). |\n", - "| uuid_time_filter | Optional | None | |\n", + "| uuid_time_filter | Optional | None | A UUIDTimeRange object to filter the results by time using the id column. |\n", "| query_params | Optional | None | |\n", "| **Returns** | **List: List of similar records.** | | |" ] @@ -1410,6 +1408,12 @@ " (uuid.uuid4(), '''{\"key2\":\"val\"}''', \"the brown fox\", [1.0, 1.8]),\n", " (uuid.uuid4(), '''{\"key_1\":\"val_1\", \"key_2\":\"val_2\"}''',\n", " \"the brown fox\", [1.0, 1.8]),\n", + "\n", + " (uuid.uuid4(), '''{\"key0\": [1,2,3,4]}''', \"the brown fox\", [1.0, 1.8]),\n", + " (uuid.uuid4(), '''{\"key0\": [8,9,\"A\"]}''', \"the brown fox\", [1.0, 1.8]), # mixed types\n", + " (uuid.uuid4(), '''{\"key0\": [5,6,7], \"key3\": 3}''', \"the brown fox\", [1.0, 1.8]),\n", + " (uuid.uuid4(), '''{\"key0\": [\"B\", \"C\"]}''', \"the brown fox\", [1.0, 1.8]),\n", + "\n", "])\n", "\n", "await vec.create_embedding_index(IvfflatIndex())\n", @@ -1475,6 +1479,20 @@ "assert len(rec) == 1\n", "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key_11\", \"<\", 11.299999))\n", "assert len(rec) == 0\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [1, 2]))\n", + "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [3, 7]))\n", + "assert len(rec) == 0\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [42]))\n", + "assert len(rec) == 0\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [4]))\n", + "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [9, \"A\"]))\n", + "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [\"A\"]))\n", + "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", (\"C\", \"B\")))\n", + "assert len(rec) == 1\n", "\n", "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(*[(\"key\", \"val2\"), (\"key_10\", \"<\", 100)]))\n", "assert len(rec) == 1\n", @@ -1486,6 +1504,10 @@ "assert len(rec) == 1\n", "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key_10\", \"<\", 100) and (Predicates(\"key\",\"==\", \"val2\") or Predicates(\"key_2\",\"==\", \"val_2\"))) \n", "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [6,7]) and Predicates(\"key3\",\"==\", 3))\n", + "assert len(rec) == 1\n", + "rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(\"key0\", \"@>\", [6,7]) and Predicates(\"key3\",\"==\", 6))\n", + "assert len(rec) == 0\n", "rec = await vec.search(limit=4, predicates=~Predicates((\"key\", \"val2\"), (\"key_10\", \"<\", 100)))\n", "assert len(rec) == 4\n", "\n", @@ -2002,24 +2024,24 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1147){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1198){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.create_tables\n", "\n", "> Sync.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ], "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1147){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1198){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.create_tables\n", "\n", "> Sync.create_tables ()\n", "\n", - "Creates necessary tables." + "*Creates necessary tables.*" ] }, "execution_count": null, @@ -2041,13 +2063,13 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1127){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1178){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.upsert\n", "\n", "> Sync.upsert (records)\n", "\n", - "Performs upsert operation for multiple records.\n", + "*Performs upsert operation for multiple records.*\n", "\n", "| | **Type** | **Details** |\n", "| -- | -------- | ----------- |\n", @@ -2057,13 +2079,13 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1127){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1178){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.upsert\n", "\n", "> Sync.upsert (records)\n", "\n", - "Performs upsert operation for multiple records.\n", + "*Performs upsert operation for multiple records.*\n", "\n", "| | **Type** | **Details** |\n", "| -- | -------- | ----------- |\n", @@ -2090,7 +2112,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1262){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1313){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.search\n", "\n", @@ -2100,7 +2122,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "Retrieves similar records using a similarity query.\n", + "*Retrieves similar records using a similarity query.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -2115,7 +2137,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1262){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/timescale/python-vector/blob/main/timescale_vector/client.py#L1313){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### Sync.search\n", "\n", @@ -2125,7 +2147,7 @@ "> uuid_time_filter:Optional[__main__.UUIDTimeRange]=None,\n", "> query_params:Optional[__main__.QueryParams]=None)\n", "\n", - "Retrieves similar records using a similarity query.\n", + "*Retrieves similar records using a similarity query.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -2198,6 +2220,8 @@ " (uuid.uuid4(), '''{\"key2\":\"val\"}''', \"the brown fox\", [1.0, 1.8]),\n", " (uuid.uuid4(), '''{\"key_1\":\"val_1\", \"key_2\":\"val_2\"}''',\n", " \"the brown fox\", [1.0, 1.8]),\n", + " (uuid.uuid4(), '''{\"key0\": [1,2,3,4]}''', \"the brown fox\", [1.0, 1.8]),\n", + " (uuid.uuid4(), '''{\"key0\": [5,6,7], \"key3\": 3}''', \"the brown fox\", [1.0, 1.8]),\n", "])\n", "\n", "vec.create_embedding_index(IvfflatIndex())\n", @@ -2412,13 +2436,6 @@ "import nbdev\n", "nbdev.nbdev_export()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/nbs/01_pgvectorizer.ipynb b/nbs/01_pgvectorizer.ipynb index f77abc8..373f9c6 100644 --- a/nbs/01_pgvectorizer.ipynb +++ b/nbs/01_pgvectorizer.ipynb @@ -225,7 +225,7 @@ "from langchain.docstore.document import Document\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from timescale_vector import client\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", "from langchain.vectorstores.timescalevector import TimescaleVector\n", "from datetime import timedelta" ] @@ -358,20 +358,6 @@ "assert vectorizer.process(embed_and_write) == 1\n", "assert vectorizer.process(embed_and_write) == 0" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -382,5 +368,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/nbs/index.ipynb b/nbs/index.ipynb index cc4650c..1590717 100644 --- a/nbs/index.ipynb +++ b/nbs/index.ipynb @@ -218,12 +218,12 @@ { "data": { "text/plain": [ - "[[UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('73d05d6e-84c1-11ee-98da-6ee10b77fd08'),\n", + " [UUID('45ecb350-0f15-11ef-8d89-e666703872d0'),\n", " {'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -257,7 +257,7 @@ { "data": { "text/plain": [ - "[[UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -299,7 +299,7 @@ { "data": { "text/plain": [ - "(UUID('73d05df0-84c1-11ee-98da-6ee10b77fd08'),\n", + "(UUID('45ecb666-0f15-11ef-8d89-e666703872d0'),\n", " {'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -428,12 +428,12 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'),\n", + " [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -465,7 +465,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -507,7 +507,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -539,12 +539,12 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('7487af14-84c1-11ee-98da-6ee10b77fd08'),\n", + " [UUID('4d629a50-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -578,7 +578,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -614,7 +614,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -646,7 +646,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -706,7 +706,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -739,7 +739,7 @@ { "data": { "text/plain": [ - "[[UUID('7487af96-84c1-11ee-98da-6ee10b77fd08'),\n", + "[[UUID('4d629b54-0f15-11ef-8d89-e666703872d0'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -797,7 +797,7 @@ { "data": { "text/plain": [ - "[[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'),\n", + "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -829,12 +829,12 @@ { "data": { "text/plain": [ - "[[UUID('ac8be800-0de6-11e9-889a-5eec84ba8a7b'),\n", + "[[UUID('0e505000-0def-11e9-8732-a154fea6fb50'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", " 0.00016793422934946456],\n", - " [UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'),\n", + " [UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -866,7 +866,7 @@ { "data": { "text/plain": [ - "[[UUID('ac8be800-0de6-11e9-889a-5eec84ba8a7b'),\n", + "[[UUID('0e505000-0def-11e9-8732-a154fea6fb50'),\n", " {'times': 100, 'action': 'jump', 'animal': 'fox'},\n", " 'jumped over the',\n", " array([ 1. , 10.8], dtype=float32),\n", @@ -900,7 +900,7 @@ { "data": { "text/plain": [ - "[[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'),\n", + "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -924,7 +924,7 @@ { "data": { "text/plain": [ - "[[UUID('33c52800-ef15-11e7-be03-4f1f9a1bde5a'),\n", + "[[UUID('95899000-ef1d-11e7-990e-7d2f7e013038'),\n", " {'times': 1, 'action': 'sit', 'animal': 'fox'},\n", " 'the brown fox',\n", " array([1. , 1.3], dtype=float32),\n", @@ -1479,7 +1479,7 @@ "from langchain.docstore.document import Document\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from timescale_vector import client, pgvectorizer\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", "from langchain.vectorstores.timescalevector import TimescaleVector\n", "from datetime import timedelta" ] @@ -1637,8 +1637,8 @@ { "data": { "text/plain": [ - "[(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-855a-06302dbc8ce7', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}),\n", - " 0.12595687795193833)]" + "[(Document(page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.', metadata={'id': '4a784000-4bc4-11eb-979c-e8748f6439f2', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00+00:00'}),\n", + " 0.12657619616729976)]" ] }, "execution_count": null, @@ -1668,12 +1668,6 @@ "\n", "This project is developed with [nbdev](https://nbdev.fast.ai/). Please see that website for the development process." ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [] } ], "metadata": { diff --git a/nbs/requirements.txt b/nbs/requirements.txt index f22ec48..51e3cc5 100644 --- a/nbs/requirements.txt +++ b/nbs/requirements.txt @@ -2,3 +2,4 @@ python-dotenv asyncpg psycopg2 pgvector +numpy \ No newline at end of file diff --git a/nbs/tsv_python_getting_started_tutorial.ipynb b/nbs/tsv_python_getting_started_tutorial.ipynb index 181e364..4a0d121 100644 --- a/nbs/tsv_python_getting_started_tutorial.ipynb +++ b/nbs/tsv_python_getting_started_tutorial.ipynb @@ -349,29 +349,29 @@ "" ], "text/plain": [ - " commit \n", - "0 44e41c12ab25e36c202f58e068ced262eadc8d16 \\\n", + " commit \\\n", + "0 44e41c12ab25e36c202f58e068ced262eadc8d16 \n", "1 e66a40038e3c84fb1a68da67ad71caf75c64a027 \n", "2 c6a930897e9f9e9878db031cc7fb6ea79d721a74 \n", "3 8e941b80ae1b0e0b6affe5431454cdc637628d99 \n", "4 caada43454e25d3098744fa6b675ac7d07390550 \n", "\n", - " author \n", - "0 Lakshmi Narayanan Sreethar \\\n", + " author \\\n", + "0 Lakshmi Narayanan Sreethar \n", "1 Bharathy \n", "2 Jan Nidzwetzki \n", "3 Lakshmi Narayanan Sreethar \n", "4 Lakshmi Narayanan Sreethar \n", "\n", - " date \n", - "0 Tue Sep 5 21:03:21 2023 +0530 \\\n", + " date \\\n", + "0 Tue Sep 5 21:03:21 2023 +0530 \n", "1 Sat Sep 2 09:24:31 2023 +0530 \n", "2 Tue Aug 29 21:13:51 2023 +0200 \n", "3 Mon Aug 28 23:19:22 2023 +0530 \n", "4 Tue May 30 20:32:29 2023 +0530 \n", "\n", - " change summary \n", - "0 Fix segfault in set_integer_now_func \\\n", + " change summary \\\n", + "0 Fix segfault in set_integer_now_func \n", "1 Fix server crash on UPDATE of compressed chunk \n", "2 Use Debian Bookworm for 32-bit tests \n", "3 Fix incorrect row count in EXPLAIN ANALYZE INS... \n", @@ -416,13 +416,15 @@ "embedding_list = []\n", "content_list = []\n", "\n", + "openai_client = openai.Client()\n", + "\n", "# Helper function: get embeddings for a text\n", "def get_embeddings(text):\n", - " response = openai.Embedding.create(\n", + " response = openai_client.embeddings.create(\n", " model=\"text-embedding-ada-002\",\n", " input = text.replace(\"\\n\",\" \")\n", " )\n", - " embedding = response['data'][0]['embedding']\n", + " embedding = response.data[0].embedding\n", " return embedding\n", "\n", "for index, row in df.iterrows():\n", @@ -714,26 +716,26 @@ { "data": { "text/plain": [ - "[ Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \" embedding=array([-0.02062987, -0.00239963, -0.00293877, ..., -0.00428753,\n", - " -0.00881631, -0.02099539], dtype=float32) distance=0.15422340530791157>,\n", - " ,\n", - " ,\n", - " Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \" embedding=array([-0.03068576, 0.01442172, -0.01128976, ..., -0.00485268,\n", - " -0.0147386 , -0.02291852], dtype=float32) distance=0.1634976668956617>,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " Mon Jun 13 17:25:59 2022 -0300 28440b79008230ef8c50da2f8d4640456bba8e02 Enable ORDER BY on Continuous Aggregates Users often execute TopN like queries over Continuous Aggregates and now with the release 2.7 such queries are even faster because we remove the re-aggregation and don't store partials anymore. Also the previous PR #4430 gave us the ability to create indexes direct on the aggregated columns leading to performance improvements. But there are a noticable performance difference between `Materialized-Only` and `Real-Time` Continuous Aggregates for TopN queries. Enabling the ORDER BY clause in the Continuous Aggregates definition result in: 1) improvements of the User Experience that can use this so commom clause in SELECT queries 2) performance improvements because we give the planner a chance to use the MergeAppend node by producing ordered datasets. Closes #4456 \" embedding=array([-0.02860901, 0.00882402, -0.01317788, ..., -0.00909296,\n", - " -0.01232746, -0.02680641], dtype=float32) distance=0.17086502834587636>,\n", - " ]" + "[ Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \" embedding=array([-0.02072006, -0.00232497, -0.00290987, ..., -0.00420762,\n", + " -0.00879542, -0.02118798], dtype=float32) distance=0.15402132505614874>,\n", + " Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \" embedding=array([-0.02072006, -0.00232497, -0.00290987, ..., -0.00420762,\n", + " -0.00879542, -0.02118798], dtype=float32) distance=0.15402132505614874>,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \" embedding=array([-0.03077092, 0.0143465 , -0.01135488, ..., -0.00501059,\n", + " -0.01490651, -0.02304872], dtype=float32) distance=0.1637590571138441>,\n", + " Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \" embedding=array([-0.03077092, 0.0143465 , -0.01135488, ..., -0.00501059,\n", + " -0.01490651, -0.02304872], dtype=float32) distance=0.1637590571138441>,\n", + " ,\n", + " ]" ] }, "execution_count": null, @@ -767,23 +769,23 @@ "--------------------------------------------------------------------------------\n", "Fabrízio de Royes Mello Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Thu Nov 24 13:19:36 2022 -0300 35c91204987ccb0161d745af1a39b7eb91bc65a5 Add Hierarchical Continuous Aggregates validations Commit 3749953e introduce Hierarchical Continuous Aggregates (aka Continuous Aggregate on top of another Continuous Aggregate) but it lacks of some basic validations. Validations added during the creation of a Hierarchical Continuous Aggregate: * Forbid create a continuous aggregate with fixed-width bucket on top of a continuous aggregate with variable-width bucket. * Forbid incompatible bucket widths: - should not be equal; - bucket width of the new continuous aggregate should be greater than the source continuous aggregate; - bucket width of the new continuous aggregate should be multiple of the source continuous aggregate. \n", + "Fabrízio de Royes Mello Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \n", "--------------------------------------------------------------------------------\n", - "Erik Nordström Tue Feb 8 09:57:23 2022 +0100 5af9f45488d51027804cac16362811f71a89bb64 Add extra telemetry for continuous aggregates Add the following telemetry fields for continuous aggregates: * The number of continuous aggregates created on distributed hypertables * The number of continuous aggregates using real-time aggregation \n", + "Fabrízio de Royes Mello Thu Nov 24 13:19:36 2022 -0300 35c91204987ccb0161d745af1a39b7eb91bc65a5 Add Hierarchical Continuous Aggregates validations Commit 3749953e introduce Hierarchical Continuous Aggregates (aka Continuous Aggregate on top of another Continuous Aggregate) but it lacks of some basic validations. Validations added during the creation of a Hierarchical Continuous Aggregate: * Forbid create a continuous aggregate with fixed-width bucket on top of a continuous aggregate with variable-width bucket. * Forbid incompatible bucket widths: - should not be equal; - bucket width of the new continuous aggregate should be greater than the source continuous aggregate; - bucket width of the new continuous aggregate should be multiple of the source continuous aggregate. \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \n", + "Fabrízio de Royes Mello Thu Nov 24 13:19:36 2022 -0300 35c91204987ccb0161d745af1a39b7eb91bc65a5 Add Hierarchical Continuous Aggregates validations Commit 3749953e introduce Hierarchical Continuous Aggregates (aka Continuous Aggregate on top of another Continuous Aggregate) but it lacks of some basic validations. Validations added during the creation of a Hierarchical Continuous Aggregate: * Forbid create a continuous aggregate with fixed-width bucket on top of a continuous aggregate with variable-width bucket. * Forbid incompatible bucket widths: - should not be equal; - bucket width of the new continuous aggregate should be greater than the source continuous aggregate; - bucket width of the new continuous aggregate should be multiple of the source continuous aggregate. \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Wed Oct 5 18:45:40 2022 -0300 3749953e9704e45df8f621607989ada0714ce28d Hierarchical Continuous Aggregates Enable users create Hierarchical Continuous Aggregates (aka Continuous Aggregates on top of another Continuous Aggregates). With this PR users can create levels of aggregation granularity in Continuous Aggregates making the refresh process even faster. A problem with this feature can be in upper levels we can end up with the \"average of averages\". But to get the \"real average\" we can rely on \"stats_aggs\" TimescaleDB Toolkit function that calculate and store the partials that can be finalized with other toolkit functions like \"average\" and \"sum\". Closes #1400 \n", + "Erik Nordström Tue Feb 8 09:57:23 2022 +0100 5af9f45488d51027804cac16362811f71a89bb64 Add extra telemetry for continuous aggregates Add the following telemetry fields for continuous aggregates: * The number of continuous aggregates created on distributed hypertables * The number of continuous aggregates using real-time aggregation \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Mon May 9 10:37:30 2022 -0300 e81e32fe5c56c39b67f2b24942deed26c0552388 Telemetry Stats for CAggs finals form Introduced by #4294 and #4269 PRs the default implementation of Continuous Aggregates get rid of `chunk_id` in the materialization hypertable and `partialize_agg`/`finalize_agg` aggregate functions. A new counter named `num_caggs_finalized` was added to telemetry report in this PR to count the number of Continuos Aggregates created in this new format. \n", + "Erik Nordström Tue Feb 8 09:57:23 2022 +0100 5af9f45488d51027804cac16362811f71a89bb64 Add extra telemetry for continuous aggregates Add the following telemetry fields for continuous aggregates: * The number of continuous aggregates created on distributed hypertables * The number of continuous aggregates using real-time aggregation \n", "--------------------------------------------------------------------------------\n", - "Rafia Sabih Wed Feb 8 11:54:28 2023 +0100 98218c1d079231a9aa469b37ddd0ed39e77c2adb Enable joins for heirarchical continuous aggregates The joins could be between a continuous aggregate and hypertable, continuous aggregate and a regular Postgres table, and continuous aggregate and a regular Postgres view. \n", + "Fabrízio de Royes Mello Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Fri Dec 9 16:01:50 2022 -0300 024b1e1f30db0c58b49eae04ff0b50055b191734 Fix CAgg on CAgg bucket size validation The bucket size of a Continuous Aggregate should be greater or equal to the parent Continuous Aggregate because there are many cases where you actually want to roll up on another dimension. \n", + "Fabrízio de Royes Mello Wed May 11 19:36:58 2022 -0300 f266f5cf564fcc5509b91493a39eb201c6f5914a Continuous Aggregates finals form Following work started by #4294 to improve performance of Continuous Aggregates by removing the re-aggregation in the user view. This PR get rid of `partialize_agg` and `finalize_agg` aggregate functions and store the finalized aggregated (plain) data in the materialization hypertable. Because we're not storing partials anymore and removed the re-aggregation, now is be possible to create indexes on aggregated columns in the materialization hypertable in order to improve the performance even more. Also removed restrictions on types of aggregates users can perform with Continuous Aggregates: * aggregates with DISTINCT * aggregates with FILTER * aggregates with FILTER in HAVING clause * aggregates without combine function * ordered-set aggregates * hypothetical-set aggregates By default new Continuous Aggregates will be created using this new format, but the previous version (with partials) will be supported. Users can create the previous style by setting to `false` the storage paramater named `timescaledb.finalized` during the creation of the Continuous Aggregate. Fixes #4233 \n", "--------------------------------------------------------------------------------\n", - "Fabrízio de Royes Mello Mon Jun 13 17:25:59 2022 -0300 28440b79008230ef8c50da2f8d4640456bba8e02 Enable ORDER BY on Continuous Aggregates Users often execute TopN like queries over Continuous Aggregates and now with the release 2.7 such queries are even faster because we remove the re-aggregation and don't store partials anymore. Also the previous PR #4430 gave us the ability to create indexes direct on the aggregated columns leading to performance improvements. But there are a noticable performance difference between `Materialized-Only` and `Real-Time` Continuous Aggregates for TopN queries. Enabling the ORDER BY clause in the Continuous Aggregates definition result in: 1) improvements of the User Experience that can use this so commom clause in SELECT queries 2) performance improvements because we give the planner a chance to use the MergeAppend node by producing ordered datasets. Closes #4456 \n", + "Fabrízio de Royes Mello Wed Oct 5 18:45:40 2022 -0300 3749953e9704e45df8f621607989ada0714ce28d Hierarchical Continuous Aggregates Enable users create Hierarchical Continuous Aggregates (aka Continuous Aggregates on top of another Continuous Aggregates). With this PR users can create levels of aggregation granularity in Continuous Aggregates making the refresh process even faster. A problem with this feature can be in upper levels we can end up with the \"average of averages\". But to get the \"real average\" we can rely on \"stats_aggs\" TimescaleDB Toolkit function that calculate and store the partials that can be finalized with other toolkit functions like \"average\" and \"sum\". Closes #1400 \n", "--------------------------------------------------------------------------------\n", - "Rafia Sabih Thu Apr 27 15:01:38 2023 +0200 d9849325d0d0f81a13db1e41aa56f8b567945e72 Improve test suite Add more regression tests for Continuous aggregates with joins. \n" + "Fabrízio de Royes Mello Wed Oct 5 18:45:40 2022 -0300 3749953e9704e45df8f621607989ada0714ce28d Hierarchical Continuous Aggregates Enable users create Hierarchical Continuous Aggregates (aka Continuous Aggregates on top of another Continuous Aggregates). With this PR users can create levels of aggregation granularity in Continuous Aggregates making the refresh process even faster. A problem with this feature can be in upper levels we can end up with the \"average of averages\". But to get the \"real average\" we can rely on \"stats_aggs\" TimescaleDB Toolkit function that calculate and store the partials that can be finalized with other toolkit functions like \"average\" and \"sum\". Closes #1400 \n" ] } ], @@ -826,10 +828,10 @@ "output_type": "stream", "text": [ "--------------------------------------------------------------------------------\n", - "18331d00-fc57-11ec-9b2d-68884ce13ab0\n", + "18331d00-fc57-11ec-a166-06cee12dbc78\n", "{'date': '2022-07-5 13:39:14+0320', 'author': 'Fabrízio de Royes Mello', 'commit': ' e34218ce2963358a500f6bc315aace0fad29c450'}\n", "Fabrízio de Royes Mello Tue Jul 5 13:39:14 2022 +0200 e34218ce2963358a500f6bc315aace0fad29c450 Migrate Continuous Aggregates to the new format Timescale 2.7 released a new version of Continuous Aggregate (#4269) that store the final aggregation state instead of the byte array of the partial aggregate state, offering multiple opportunities of optimizations as well a more compact form. When upgrading to Timescale 2.7, new created Continuous Aggregates are using the new format, but existing Continuous Aggregates keep using the format they were defined with. Created a procedure to upgrade existing Continuous Aggregates from the old format to the new format, by calling a simple procedure: test=# CALL cagg_migrate('conditions_summary_daily'); Closes #4424 \n", - "0.15422340530791157\n", + "0.15402132505614874\n", "--------------------------------------------------------------------------------\n" ] } @@ -874,9 +876,9 @@ "--------------------------------------------------------------------------------\n", "Rafia Sabih Wed Feb 8 11:54:28 2023 +0100 98218c1d079231a9aa469b37ddd0ed39e77c2adb Enable joins for heirarchical continuous aggregates The joins could be between a continuous aggregate and hypertable, continuous aggregate and a regular Postgres table, and continuous aggregate and a regular Postgres view. \n", "--------------------------------------------------------------------------------\n", - "Rafia Sabih Thu Apr 27 15:01:38 2023 +0200 d9849325d0d0f81a13db1e41aa56f8b567945e72 Improve test suite Add more regression tests for Continuous aggregates with joins. \n", + "Rafia Sabih Wed Feb 8 11:54:28 2023 +0100 98218c1d079231a9aa469b37ddd0ed39e77c2adb Enable joins for heirarchical continuous aggregates The joins could be between a continuous aggregate and hypertable, continuous aggregate and a regular Postgres table, and continuous aggregate and a regular Postgres view. \n", "--------------------------------------------------------------------------------\n", - "Rafia Sabih Mon Oct 24 13:05:55 2022 +0200 a67b90e977194f3e55c93ed6b3f5d2a671d503c1 Allow joins in continuous aggregates Enable the support of having join in the query used for creating the continuous aggregates. It has follwoing restrictions- 1. Join can involve only one hypertable and one normal table 2. Join should be a inner join 3. Join condition can only be equality \n" + "Rafia Sabih Thu Apr 27 15:01:38 2023 +0200 d9849325d0d0f81a13db1e41aa56f8b567945e72 Improve test suite Add more regression tests for Continuous aggregates with joins. \n" ] } ], @@ -1047,9 +1049,9 @@ "--------------------------------------------------------------------------------\n", "Sven Klemm Tue Aug 29 18:13:24 2023 +0200 e4facda540286b0affba47ccc63959fefe2a7b26 Add compatibility layer for _timescaledb_internal functions With timescaledb 2.12 all the functions present in _timescaledb_internal were moved into the _timescaledb_functions schema to improve schema security. This patch adds a compatibility layer so external callers of these internal functions will not break and allow for more flexibility when migrating. \n", "--------------------------------------------------------------------------------\n", - "Dmitry Simonenko Thu Aug 3 14:30:23 2023 +0300 7aeed663b9c0f337b530fd6cad47704a51a9b2ec Feature flags for TimescaleDB features This PR adds several GUCs which allow to enable/disable major timescaledb features: - enable_hypertable_create - enable_hypertable_compression - enable_cagg_create - enable_policy_create \n", + "Sven Klemm Tue Aug 29 18:13:24 2023 +0200 e4facda540286b0affba47ccc63959fefe2a7b26 Add compatibility layer for _timescaledb_internal functions With timescaledb 2.12 all the functions present in _timescaledb_internal were moved into the _timescaledb_functions schema to improve schema security. This patch adds a compatibility layer so external callers of these internal functions will not break and allow for more flexibility when migrating. \n", "--------------------------------------------------------------------------------\n", - "Konstantina Skovola Wed Aug 9 15:26:03 2023 +0300 44eab9cf9bef34274c88efd37a750eaa74cd8044 Release 2.11.2 This release contains bug fixes since the 2.11.1 release. We recommend that you upgrade at the next available opportunity. **Features** * #5923 Feature flags for TimescaleDB features **Bugfixes** * #5680 Fix DISTINCT query with JOIN on multiple segmentby columns * #5774 Fixed two bugs in decompression sorted merge code * #5786 Ensure pg_config --cppflags are passed * #5906 Fix quoting owners in sql scripts. * #5912 Fix crash in 1-step integer policy creation **Thanks** * @mrksngl for submitting a PR to fix extension upgrade scripts * @ericdevries for reporting an issue with DISTINCT queries using segmentby columns of compressed hypertable \n" + "Dmitry Simonenko Thu Aug 3 14:30:23 2023 +0300 7aeed663b9c0f337b530fd6cad47704a51a9b2ec Feature flags for TimescaleDB features This PR adds several GUCs which allow to enable/disable major timescaledb features: - enable_hypertable_create - enable_hypertable_compression - enable_cagg_create - enable_policy_create \n" ] } ], @@ -1094,5 +1096,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/timescale_vector/client.py b/timescale_vector/client.py index 17c56a8..ccab029 100644 --- a/timescale_vector/client.py +++ b/timescale_vector/client.py @@ -328,9 +328,10 @@ class Predicates: "<=": "<=", "<": "<", "!=": "<>", + "@>": "@>", # array contains } - PredicateValue = Union[str, int, float, datetime] + PredicateValue = Union[str, int, float, datetime, list, tuple] def __init__(self, *clauses: Union['Predicates', Tuple[str, PredicateValue], Tuple[str, str, PredicateValue], str, PredicateValue], operator: str = 'AND'): """ @@ -431,18 +432,24 @@ def build_query(self, params: List) -> Tuple[str, List]: else: where_conditions.append(f"uuid_timestamp(id) {operator} {param_name}") params.append(value) - continue - - field_cast = '' - if isinstance(value, int): - field_cast = '::int' - elif isinstance(value, float): - field_cast = '::numeric' - elif isinstance(value, datetime): - field_cast = '::timestamptz' - - where_conditions.append(f"(metadata->>'{field}'){field_cast} {operator} {param_name}") - params.append(value) + + elif operator == "@>" and (isinstance(value, list) or isinstance(value, tuple)): + if len(value) == 0: + raise ValueError("Invalid value. Empty lists and empty tuples are not supported.") + json_value = json.dumps(value) + where_conditions.append(f"metadata @> jsonb_build_object('{field}', {param_name}::jsonb)") + params.append(json_value) + + else: + field_cast = '' + if isinstance(value, int): + field_cast = '::int' + elif isinstance(value, float): + field_cast = '::numeric' + elif isinstance(value, datetime): + field_cast = '::timestamptz' + where_conditions.append(f"(metadata->>'{field}'){field_cast} {operator} {param_name}") + params.append(value) if self.operator == 'NOT': or_clauses = (" OR ").join(where_conditions) @@ -972,7 +979,7 @@ async def create_embedding_index(self, index: BaseIndex): The index to create. Returns - -------- + ------- None """ #todo: can we make geting the records lazy? @@ -1003,9 +1010,12 @@ async def search(self, A filter for metadata. Should be specified as a key-value object or a list of key-value objects (where any objects in the list are matched). predicates A Predicates object to filter the results. Predicates support more complex queries than the filter parameter. Predicates can be combined using logical operators (&, |, and ~). + uuid_time_filter + A UUIDTimeRange object to filter the results by time using the id column. + query_params Returns - -------- + ------- List: List of similar records. """ (query, params) = self.builder.search_query(