From f984d62c11888d6c07d1a5a6318cd69cf62961c0 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 25 Mar 2025 08:48:38 -0400 Subject: [PATCH 01/11] very much wip --- docs/user_guide/data_validation.ipynb | 1102 ++++++++++++++++++ docs/validation.md | 228 ++++ redisvl/index/index.py | 37 +- redisvl/index/storage.py | 322 ++--- redisvl/schema/__init__.py | 34 +- redisvl/schema/fields.py | 4 +- redisvl/schema/schema.py | 62 +- redisvl/schema/type_utils.py | 63 + redisvl/schema/validation.py | 290 +++++ tests/integration/test_async_search_index.py | 2 +- tests/integration/test_flow_async.py | 2 +- tests/integration/test_search_index.py | 2 +- tests/unit/conftest.py | 183 +++ tests/unit/test_edge_cases.py | 451 +++++++ tests/unit/test_fields.py | 57 + tests/unit/test_storage.py | 560 ++++++++- tests/unit/test_validation.py | 515 ++++++++ 17 files changed, 3658 insertions(+), 256 deletions(-) create mode 100644 docs/user_guide/data_validation.ipynb create mode 100644 docs/validation.md create mode 100644 redisvl/schema/type_utils.py create mode 100644 redisvl/schema/validation.py create mode 100644 tests/unit/conftest.py create mode 100644 tests/unit/test_edge_cases.py create mode 100644 tests/unit/test_validation.py diff --git a/docs/user_guide/data_validation.ipynb b/docs/user_guide/data_validation.ipynb new file mode 100644 index 00000000..366f47a4 --- /dev/null +++ b/docs/user_guide/data_validation.ipynb @@ -0,0 +1,1102 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with RedisVL\n", + "`redisvl` is a versatile Python library with an integrated CLI, designed to enhance AI applications using Redis. This guide will walk you through the following steps:\n", + "\n", + "1. Defining an `IndexSchema`\n", + "2. Preparing a sample dataset\n", + "3. Creating a `SearchIndex` object\n", + "4. Testing `rvl` CLI functionality\n", + "5. Loading the sample data\n", + "6. Building `VectorQuery` objects and executing searches\n", + "7. Updating a `SearchIndex` object\n", + "\n", + "...and more!\n", + "\n", + "Prerequisites:\n", + "- Ensure `redisvl` is installed in your Python environment.\n", + "- Have a running instance of [Redis Stack](https://redis.io/docs/install/install-stack/) or [Redis Cloud](https://redis.io/cloud).\n", + "\n", + "_____" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define an `IndexSchema`\n", + "\n", + "The `IndexSchema` maintains crucial **index configuration** and **field definitions** to\n", + "enable search with Redis. For ease of use, the schema can be constructed from a\n", + "python dictionary or yaml file.\n", + "\n", + "### Example Schema Creation\n", + "Consider a dataset with user information, including `job`, `age`, `credit_score`,\n", + "and a 3-dimensional `user_embedding` vector.\n", + "\n", + "You must also decide on a Redis index name and key prefix to use for this\n", + "dataset. 
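(For orientation: RedisVL composes each Redis key from the index `prefix`, the `key_separator`, and a document ID. A minimal sketch of the resulting key shape — the ULID shown is purely illustrative:)

```python
# Hypothetical sketch of how RedisVL composes keys; the ULID is illustrative.
prefix = "user_simple_docs"
key_separator = ":"
doc_id = "01JQ4Y9V0NK7QBYKMCP47MT3DE"  # random ULID generated at load time

key = f"{prefix}{key_separator}{doc_id}"
print(key)  # user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE
```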
Below are example schema definitions in both YAML and Dict format.\n", + "\n", + "**YAML Definition:**\n", + "\n", + "```yaml\n", + "version: '0.1.0'\n", + "\n", + "index:\n", + " name: user_simple\n", + " prefix: user_simple_docs\n", + "\n", + "fields:\n", + " - name: user\n", + " type: tag\n", + " - name: credit_score\n", + " type: tag\n", + " - name: job\n", + " type: text\n", + " - name: age\n", + " type: numeric\n", + " - name: user_embedding\n", + " type: vector\n", + " attrs:\n", + " algorithm: flat\n", + " dims: 3\n", + " distance_metric: cosine\n", + " datatype: float32\n", + "```\n", + "> Store this in a local file, such as `schema.yaml`, for RedisVL usage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Python Dictionary:**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "schema = {\n", + " \"index\": {\n", + " \"name\": \"user_simple\",\n", + " \"prefix\": \"user_simple_docs\",\n", + " \"storage_type\": \"json\"\n", + " },\n", + " \"fields\": [\n", + " {\"name\": \"user\", \"type\": \"tag\"},\n", + " {\"name\": \"credit_score\", \"type\": \"tag\"},\n", + " {\"name\": \"job\", \"type\": \"text\"},\n", + " {\"name\": \"age\", \"type\": \"numeric\"},\n", + " {\"name\": \"location\", \"type\": \"geo\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", + " \"dims\": 3,\n", + " \"distance_metric\": \"cosine\",\n", + " \"algorithm\": \"flat\",\n", + " \"datatype\": \"float32\"\n", + " }\n", + " }\n", + " ]\n", + "}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sample Dataset Preparation\n", + "\n", + "Below, create a mock dataset with `user`, `job`, `age`, `credit_score`, and\n", + "`user_embedding` fields. The `user_embedding` vectors are synthetic examples\n", + "for demonstration purposes.\n", + "\n", + "For more information on creating real-world embeddings, refer to this\n", + "[article](https://mlops.community/vector-similarity-search-from-basics-to-production/)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "data = [\n", + " {\n", + " 'user': 'john',\n", + " 'age': 1,\n", + " 'job': 'engineer',\n", + " 'credit_score': 'high',\n", + " 'location': '37.540760,-77.433929',\n", + " 'user_embedding': np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes()\n", + " },\n", + " {\n", + " 'user': 'mary',\n", + " 'age': 2,\n", + " 'job': 'doctor',\n", + " 'credit_score': 'low',\n", + " 'location': '37.540760,-77.433929',\n", + " 'user_embedding': np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes()\n", + " },\n", + " {\n", + " 'user': 'joe',\n", + " 'age': 3,\n", + " 'job': 'dentist',\n", + " 'credit_score': 'medium',\n", + " 'location': '37.540760,-77.433929',\n", + " 'user_embedding': np.array([0.9, 0.9, 0.1], dtype=np.float32).tobytes()\n", + " }\n", + "]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + ">As seen above, the sample `user_embedding` vectors are converted into bytes. Using the `NumPy`, this is fairly trivial." 
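As a quick sanity check, the byte string round-trips back to floats with `np.frombuffer`. Note that the bytes representation suits hash storage; since this schema uses JSON storage, a plain list of floats is expected instead (a point the validation error later in this notebook makes explicit). A minimal sketch:

```python
import numpy as np

# Serialize a vector to raw float32 bytes (suitable for hash storage)
vec_bytes = np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes()

# Decode it back to verify the round trip; dtype must match the schema's datatype
decoded = np.frombuffer(vec_bytes, dtype=np.float32)
print(decoded.tolist())  # [0.10000000149011612, 0.10000000149011612, 0.5]
```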
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a `SearchIndex`\n", + "\n", + "With the schema and sample dataset ready, instantiate a `SearchIndex`:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.index import SearchIndex\n", + "\n", + "index = SearchIndex.from_dict(schema)\n", + "# or use .from_yaml('schema_file.yaml')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we also need to facilitate a Redis connection. There are a few ways to do this:\n", + "\n", + "- Create & manage your own client connection (recommended)\n", + "- Provide a Redis URL and let RedisVL connect on your behalf (by default, it will connect to \"redis://localhost:6379\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bring your own Redis connection instance\n", + "\n", + "This is ideal in scenarios where you have custom settings on the connection instance or if your application will share a connection pool:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from redis import Redis\n", + "\n", + "client = Redis.from_url(\"redis://localhost:6379\")\n", + "index = SearchIndex.from_dict(schema, redis_client=client)\n", + "\n", + "# alternatively, provide an async Redis client object to enable async index operations\n", + "# from redis.asyncio import Redis\n", + "# from redisvl.index import AsyncSearchIndex\n", + "# client = Redis.from_url(\"redis://localhost:6379\")\n", + "# index = AsyncSearchIndex.from_dict(schema, redis_client=client)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let the index manage the connection instance\n", + "\n", + "This is ideal for simple cases:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "index = SearchIndex.from_dict(schema, redis_url=\"redis://localhost:6379\", validate_on_load=True)\n", + "\n", + "# If you don't specify a client or Redis URL, the index will attempt to\n", + "# connect to Redis at the default address (\"redis://localhost:6379\")." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the underlying index\n", + "\n", + "Now that we are connected to Redis, we need to run the create command." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16:42:16 redisvl.index.index INFO Index already exists, overwriting.\n" + ] + } + ], + "source": [ + "index.create(overwrite=True, drop=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ">Note that at this point, the index has no entries. Data loading follows." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect with the `rvl` CLI\n", + "Use the `rvl` CLI to inspect the created index and its fields:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m16:36:30\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m16:36:30\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
user_simple\n" + ] + } + ], + "source": [ + "!rvl index listall" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Index Information:\n", + "╭──────────────┬────────────────┬──────────────────────┬─────────────────┬────────────╮\n", + "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", + "├──────────────┼────────────────┼──────────────────────┼─────────────────┼────────────┤\n", + "│ user_simple │ JSON │ ['user_simple_docs'] │ [] │ 0 │\n", + "╰──────────────┴────────────────┴──────────────────────┴─────────────────┴────────────╯\n", + "Index Fields:\n", + "╭──────────────────┬────────────────┬─────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮\n", + "│ Name │ Attribute │ Type │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │\n", + "├──────────────────┼────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┤\n", + "│ $.user │ user │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │\n", + "│ $.credit_score │ credit_score │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │\n", + "│ $.job │ job │ TEXT │ WEIGHT │ 1 │ │ │ │ │ │ │\n", + "│ $.age │ age │ NUMERIC │ │ │ │ │ │ │ │ │\n", + "│ $.location │ location │ GEO │ │ │ │ │ │ │ │ │\n", + "│ $.user_embedding │ user_embedding │ VECTOR │ algorithm │ FLAT │ data_type │ FLOAT32 │ dim │ 3 │ distance_metric │ COSINE │\n", + "╰──────────────────┴────────────────┴─────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────╯\n" + ] + } + ], + "source": [ + "!rvl index info -i user_simple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data to `SearchIndex`\n", + "\n", + "Load the sample dataset to Redis:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16:42:27 redisvl.index.index ERROR Error while loading data to Redis\n", + "Traceback (most recent call last):\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 201, in _preprocess_and_validate_objects\n", + " processed_obj = self.validate(processed_obj)\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 158, in validate\n", + " return validate_object(self.index_schema, obj)\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py\", line 254, in validate_object\n", + " validated = model_class.model_validate(flat_obj)\n", + " File \"/Users/tyler.hutcherson/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py\", line 627, in model_validate\n", + " return cls.__pydantic_validator__.validate_python(\n", + " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^\n", + " obj, strict=strict, from_attributes=from_attributes, context=context\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + "pydantic_core._pydantic_core.ValidationError: 1 validation error for user_simple__PydanticModel\n", + "user_embedding\n", + " Input should be a valid list 
[type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n", + " For further information visit https://errors.pydantic.dev/2.10/v/list_type\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py\", line 600, in load\n", + " return self._storage.write(\n", + " ~~~~~~~~~~~~~~~~~~~^\n", + " self._redis_client, # type: ignore\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<6 lines>...\n", + " validate=self._validate_on_load,\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 260, in write\n", + " prepared_objects = self._preprocess_and_validate_objects(\n", + " objects,\n", + " ...<3 lines>...\n", + " validate=validate\n", + " )\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 212, in _preprocess_and_validate_objects\n", + " raise ValueError(f\"Validation failed for object {object_id}: {str(e)}\")\n", + "ValueError: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\n", + "user_embedding\n", + " Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n", + " For further information visit https://errors.pydantic.dev/2.10/v/list_type\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:201\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate:\n\u001b[0;32m--> 201\u001b[0m processed_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocessed_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[38;5;66;03m# Store valid object with its key for writing\u001b[39;00m\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:158\u001b[0m, in \u001b[0;36mBaseStorage.validate\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# Pass directly to validation function and let any errors propagate\u001b[39;00m\n\u001b[0;32m--> 158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvalidate_object\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex_schema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py:254\u001b[0m, in 
\u001b[0;36mvalidate_object\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[38;5;66;03m# Validate against model\u001b[39;00m\n\u001b[0;32m--> 254\u001b[0m validated \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m validated\u001b[38;5;241m.\u001b[39mmodel_dump(exclude_none\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py:627\u001b[0m, in \u001b[0;36mBaseModel.model_validate\u001b[0;34m(cls, obj, strict, from_attributes, context)\u001b[0m\n\u001b[1;32m 626\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 627\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrom_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfrom_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m keys \u001b[38;5;241m=\u001b[39m \u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(keys)\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py:600\u001b[0m, in \u001b[0;36mSearchIndex.load\u001b[0;34m(self, data, id_field, keys, ttl, preprocess, batch_size)\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load objects to the Redis database. 
Returns the list of keys loaded\u001b[39;00m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;124;03mto Redis.\u001b[39;00m\n\u001b[1;32m 553\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124;03m keys = index.load(data, preprocess=add_field)\u001b[39;00m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_storage\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_redis_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 602\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjects\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 603\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 604\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 605\u001b[0m \u001b[43m \u001b[49m\u001b[43mttl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mttl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 606\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 607\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 608\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_on_load\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 609\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 611\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError while loading data to Redis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:260\u001b[0m, in \u001b[0;36mBaseStorage.write\u001b[0;34m(self, redis_client, objects, id_field, keys, ttl, preprocess, batch_size, validate)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m []\n\u001b[1;32m 259\u001b[0m \u001b[38;5;66;03m# Pass 1: Preprocess and validate all objects\u001b[39;00m\n\u001b[0;32m--> 260\u001b[0m prepared_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_preprocess_and_validate_objects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjects\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 262\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 263\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 264\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 265\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\n\u001b[1;32m 266\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# Pass 2: Write all valid objects in batches\u001b[39;00m\n\u001b[1;32m 269\u001b[0m added_keys \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:212\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m id_field \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m id_field \u001b[38;5;129;01min\u001b[39;00m obj:\n\u001b[1;32m 210\u001b[0m object_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwith \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mid_field\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mobj[id_field]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mValidation failed for object \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mobject_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m prepared_objects\n", + "\u001b[0;31mValueError\u001b[0m: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type" + ] + } + ], + "source": [ + "keys = index.load(data)\n", + "\n", + "print(keys)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "IndexInfo(name='user_simple', prefix='user_simple_docs', key_separator=':', storage_type=)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.schema.index" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.load(data=[\n", + " {\n", + " 'user': 'john',\n", + " 'age': 1,\n", + " 'job': 'engineer',\n", + " 'credit_score': 'high',\n", + " 'location': 1,\n", + " 'user_embedding': [\n", + " ]\n", + " }\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'index_name': 'user_simple',\n", + " 'index_options': [],\n", + " 'index_definition': ['key_type',\n", + " 'JSON',\n", + " 'prefixes',\n", + " ['user_simple_docs'],\n", + " 'default_score',\n", + " '1',\n", + " 'indexes_all',\n", + " 
'false'],\n", + " 'attributes': [['identifier',\n", + " '$.user',\n", + " 'attribute',\n", + " 'user',\n", + " 'type',\n", + " 'TAG',\n", + " 'SEPARATOR',\n", + " ','],\n", + " ['identifier',\n", + " '$.credit_score',\n", + " 'attribute',\n", + " 'credit_score',\n", + " 'type',\n", + " 'TAG',\n", + " 'SEPARATOR',\n", + " ','],\n", + " ['identifier', '$.job', 'attribute', 'job', 'type', 'TEXT', 'WEIGHT', '1'],\n", + " ['identifier', '$.age', 'attribute', 'age', 'type', 'NUMERIC'],\n", + " ['identifier', '$.location', 'attribute', 'location', 'type', 'GEO'],\n", + " ['identifier',\n", + " '$.user_embedding',\n", + " 'attribute',\n", + " 'user_embedding',\n", + " 'type',\n", + " 'VECTOR',\n", + " 'algorithm',\n", + " 'FLAT',\n", + " 'data_type',\n", + " 'FLOAT32',\n", + " 'dim',\n", + " 3,\n", + " 'distance_metric',\n", + " 'COSINE']],\n", + " 'num_docs': 2,\n", + " 'max_doc_id': 2,\n", + " 'num_terms': 2,\n", + " 'num_records': 12,\n", + " 'inverted_sz_mb': '4.61578369140625e-4',\n", + " 'vector_index_sz_mb': '0.028045654296875',\n", + " 'total_inverted_index_blocks': 5,\n", + " 'offset_vectors_sz_mb': '3.814697265625e-6',\n", + " 'doc_table_size_mb': '2.117156982421875e-4',\n", + " 'sortable_values_size_mb': '0',\n", + " 'key_table_size_mb': '8.296966552734375e-5',\n", + " 'tag_overhead_sz_mb': '5.53131103515625e-5',\n", + " 'text_overhead_sz_mb': '6.67572021484375e-5',\n", + " 'total_index_memory_sz_mb': '9.565353393554688e-4',\n", + " 'geoshapes_sz_mb': '0',\n", + " 'records_per_doc_avg': '6',\n", + " 'bytes_per_record_avg': '40.33333206176758',\n", + " 'offsets_per_term_avg': '0.3333333432674408',\n", + " 'offset_bits_per_record_avg': '8',\n", + " 'hash_indexing_failures': 4,\n", + " 'total_indexing_time': '0.3160000145435333',\n", + " 'indexing': 0,\n", + " 'percent_indexed': '1',\n", + " 'number_of_uses': 2,\n", + " 'cleaning': 0,\n", + " 'gc_stats': ['bytes_collected',\n", + " '0',\n", + " 'total_ms_run',\n", + " '0',\n", + " 'total_cycles',\n", + " '0',\n", + " 'average_cycle_time_ms',\n", + " 'nan',\n", + " 'last_run_time_ms',\n", + " '0',\n", + " 'gc_numeric_trees_missed',\n", + " '0',\n", + " 'gc_blocks_denied',\n", + " '0'],\n", + " 'cursor_stats': ['global_idle',\n", + " 0,\n", + " 'global_total',\n", + " 0,\n", + " 'index_capacity',\n", + " 128,\n", + " 'index_total',\n", + " 0],\n", + " 'dialect_stats': ['dialect_1',\n", + " 0,\n", + " 'dialect_2',\n", + " 0,\n", + " 'dialect_3',\n", + " 0,\n", + " 'dialect_4',\n", + " 0],\n", + " 'Index Errors': ['indexing failures',\n", + " 4,\n", + " 'last indexing error',\n", + " 'Empty array for vector field on JSON document',\n", + " 'last indexing error key',\n", + " 'user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE'],\n", + " 'field statistics': [['identifier',\n", + " '$.user',\n", + " 'attribute',\n", + " 'user',\n", + " 'Index Errors',\n", + " ['indexing failures',\n", + " 0,\n", + " 'last indexing error',\n", + " 'N/A',\n", + " 'last indexing error key',\n", + " 'N/A']],\n", + " ['identifier',\n", + " '$.credit_score',\n", + " 'attribute',\n", + " 'credit_score',\n", + " 'Index Errors',\n", + " ['indexing failures',\n", + " 0,\n", + " 'last indexing error',\n", + " 'N/A',\n", + " 'last indexing error key',\n", + " 'N/A']],\n", + " ['identifier',\n", + " '$.job',\n", + " 'attribute',\n", + " 'job',\n", + " 'Index Errors',\n", + " ['indexing failures',\n", + " 0,\n", + " 'last indexing error',\n", + " 'N/A',\n", + " 'last indexing error key',\n", + " 'N/A']],\n", + " ['identifier',\n", + " '$.age',\n", + " 'attribute',\n", + " 'age',\n", 
+ " 'Index Errors',\n", + " ['indexing failures',\n", + " 0,\n", + " 'last indexing error',\n", + " 'N/A',\n", + " 'last indexing error key',\n", + " 'N/A']],\n", + " ['identifier',\n", + " '$.location',\n", + " 'attribute',\n", + " 'location',\n", + " 'Index Errors',\n", + " ['indexing failures',\n", + " 0,\n", + " 'last indexing error',\n", + " 'N/A',\n", + " 'last indexing error key',\n", + " 'N/A']],\n", + " ['identifier',\n", + " '$.user_embedding',\n", + " 'attribute',\n", + " 'user_embedding',\n", + " 'Index Errors',\n", + " ['indexing failures',\n", + " 4,\n", + " 'last indexing error',\n", + " 'Empty array for vector field on JSON document',\n", + " 'last indexing error key',\n", + " 'user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE']]]}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ">By default, `load` will create a unique Redis key as a combination of the index key `prefix` and a random ULID. You can also customize the key by providing direct keys or pointing to a specified `id_field` on load." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upsert the index with new data\n", + "Upsert data by using the `load` method again:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['user_simple_docs:01JM2NWJGYMJ0QTR5YB4MB0BX9']\n" + ] + } + ], + "source": [ + "# Add more data\n", + "new_data = [{\n", + " 'user': 'tyler',\n", + " 'age': 9,\n", + " 'job': 'engineer',\n", + " 'credit_score': 'high',\n", + " 'user_embedding': np.array([0.1, 0.3, 0.5], dtype=np.float32).tobytes()\n", + "}]\n", + "keys = index.load(new_data)\n", + "\n", + "print(keys)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating `VectorQuery` Objects\n", + "\n", + "Next we will create a vector query object for our newly populated index. This example will use a simple vector to demonstrate how vector similarity works. Vectors in production will likely be much larger than 3 floats and often require Machine Learning models (i.e. Huggingface sentence transformers) or an embeddings API (Cohere, OpenAI). `redisvl` provides a set of [Vectorizers](https://docs.redisvl.com/en/latest/user_guide/vectorizers_04.html#openai) to assist in vector creation." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "from redisvl.query import VectorQuery\n", + "from jupyterutils import result_print\n", + "\n", + "query = VectorQuery(\n", + " vector=[0.1, 0.1, 0.5],\n", + " vector_field_name=\"user_embedding\",\n", + " return_fields=[\"user\", \"age\", \"job\", \"credit_score\", \"vector_distance\"],\n", + " num_results=3\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Executing queries\n", + "With our `VectorQuery` object defined above, we can execute the query over the `SearchIndex` using the `query` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*=>[KNN 3 @user_embedding $vector AS vector_distance] RETURN 6 user age job credit_score vector_distance vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3\n" + ] + }, + { + "data": { + "text/html": [ + "table>vector_distanceuseragejobcredit_score0john1engineerhigh0mary2doctorlow0.0566299557686tyler9engineerhigh" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "results = index.query(query)\n", + "result_print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using an Asynchronous Redis Client\n", + "\n", + "The `AsyncSearchIndex` class along with an async Redis python client allows for queries, index creation, and data loading to be done asynchronously. This is the\n", + "recommended route for working with `redisvl` in production-like settings." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'index': {'name': 'user_simple', 'prefix': 'user_simple_docs'},\n", + " 'fields': [{'name': 'user', 'type': 'tag'},\n", + " {'name': 'credit_score', 'type': 'tag'},\n", + " {'name': 'job', 'type': 'text'},\n", + " {'name': 'age', 'type': 'numeric'},\n", + " {'name': 'user_embedding',\n", + " 'type': 'vector',\n", + " 'attrs': {'dims': 3,\n", + " 'distance_metric': 'cosine',\n", + " 'algorithm': 'flat',\n", + " 'datatype': 'float32'}}]}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "schema" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from redisvl.index import AsyncSearchIndex\n", + "from redis.asyncio import Redis\n", + "\n", + "client = Redis.from_url(\"redis://localhost:6379\")\n", + "\n", + "index = AsyncSearchIndex.from_dict(schema, redis_client=client)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr><tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr><tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr><tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# execute the vector query async\n", + "results = await index.query(query)\n", + "result_print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Updating a schema\n", + "In some scenarios, it makes sense to update the index schema. With Redis and `redisvl`, this is easy because Redis can keep the underlying data in place while you change or make updates to the index configuration." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So for our scenario, let's imagine we want to reindex this data in 2 ways:\n", + "- by using a `Tag` type for `job` field instead of `Text`\n", + "- by using an `hnsw` vector index for the `user_embedding` field instead of a `flat` vector index" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Modify this schema to have what we want\n", + "\n", + "index.schema.remove_field(\"job\")\n", + "index.schema.remove_field(\"user_embedding\")\n", + "index.schema.add_fields([\n", + " {\"name\": \"job\", \"type\": \"tag\"},\n", + " {\n", + " \"name\": \"user_embedding\",\n", + " \"type\": \"vector\",\n", + " \"attrs\": {\n", + " \"dims\": 3,\n", + " \"distance_metric\": \"cosine\",\n", + " \"algorithm\": \"hnsw\",\n", + " \"datatype\": \"float32\"\n", + " }\n", + " }\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11:28:32 redisvl.index.index INFO Index already exists, overwriting.\n" + ] + } + ], + "source": [ + "# Run the index update but keep underlying data in place\n", + "await index.create(overwrite=True, drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr><tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr><tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr><tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Execute the vector query async\n", + "results = await index.query(query)\n", + "result_print(results)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Check Index Stats\n", + "Use the `rvl` CLI to check the stats for the index:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Statistics:\n", + "╭─────────────────────────────┬─────────────╮\n", + "│ Stat Key │ Value │\n", + "├─────────────────────────────┼─────────────┤\n", + "│ num_docs │ 4 │\n", + "│ num_terms │ 4 │\n", + "│ max_doc_id │ 4 │\n", + "│ num_records │ 22 │\n", + "│ percent_indexed │ 1 │\n", + "│ hash_indexing_failures │ 0 │\n", + "│ number_of_uses │ 2 │\n", + "│ bytes_per_record_avg │ 47.8 │\n", + "│ doc_table_size_mb │ 0.000423431 │\n", + "│ inverted_sz_mb │ 0.000911713 │\n", + "│ key_table_size_mb │ 0.000165939 │\n", + "│ offset_bits_per_record_avg │ nan │\n", + "│ offset_vectors_sz_mb │ 0 │\n", + "│ offsets_per_term_avg │ 0 │\n", + "│ records_per_doc_avg │ 5 │\n", + "│ sortable_values_size_mb │ 0 │\n", + "│ total_indexing_time │ 0.239 │\n", + "│ total_inverted_index_blocks │ 11 │\n", + "│ vector_index_sz_mb │ 0.235603 │\n", + "╰─────────────────────────────┴─────────────╯\n" + ] + } + ], + "source": [ + "!rvl stats -i user_simple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we will clean up after our work. First, you can flush all data from Redis associated with the index by\n", + "using the `.clear()` method. This will leave the secondary index in place for future insertions or updates.\n", + "\n", + "But if you want to clean up everything, including the index, just use `.delete()`\n", + "which will by default remove the index AND the underlying data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Clear all data from Redis associated with the index\n", + "await index.clear()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Butm the index is still in place\n", + "await index.exists()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove / delete the index in its entirety\n", + "await index.delete()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.2" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/validation.md b/docs/validation.md new file mode 100644 index 00000000..204a009f --- /dev/null +++ b/docs/validation.md @@ -0,0 +1,228 @@ +# RedisVL Validation System + +The RedisVL validation system ensures that data written to Redis indexes conforms to the defined schema. It uses dynamic Pydantic model generation to validate objects before they are stored. + +## Key Features + +- **Schema-Based Validation**: Validates objects against your index schema definition +- **Dynamic Model Generation**: Creates Pydantic models on the fly based on your schema +- **Type Checking**: Ensures fields contain appropriate data types +- **Field-Specific Validation**: + - Text and Tag fields must be strings + - Numeric fields must be integers or floats + - Geo fields must be properly formatted latitude/longitude strings + - Vector fields must have the correct dimensions and data types +- **JSON Path Support**: Validates fields extracted from nested JSON structures +- **Fail-Fast Approach**: Stops processing at the first validation error +- **Performance Optimized**: Caches models for repeated validation + +## Usage + +### Basic Validation + +```python +from redisvl.schema.validation import validate_object + +# Assuming you have a schema defined +validated_data = validate_object(schema, data) +``` + +### Storage Integration + +The validation is automatically integrated with the storage classes: + +```python +from redisvl.index.storage import BaseStorage + +# Create storage with schema +storage = BaseStorage(schema=schema, client=redis_client) + +# Write data - validation happens automatically +storage.write_one(data) + +# Or validate explicitly +validated = storage.validate_object(data) +``` + +## Field Type Validation + +The validation system supports all Redis field types: + +### Text Fields + +Text fields are validated to ensure they contain string values: + +```python +# Valid +{"title": "Hello World"} + +# Invalid +{"title": 123} # Not a string +``` + +### Tag Fields + +Tag fields are validated to ensure they contain string values: + +```python +# Valid +{"category": "electronics"} + +# Invalid +{"category": 123} # Not a string +``` + +### Numeric Fields + +Numeric fields must contain integers or floats: + +```python +# Valid +{"price": 
19.99} +{"quantity": 5} + +# Invalid +{"price": "19.99"} # String, not a number +``` + +### Geo Fields + +Geo fields must contain properly formatted latitude/longitude strings: + +```python +# Valid +{"location": "37.7749,-122.4194"} # San Francisco +{"location": "40.7128,-74.0060"} # New York + +# Invalid +{"location": "invalid"} # Not in lat,lon format +{"location": "91.0,0.0"} # Latitude out of range (-90 to 90) +{"location": "0.0,181.0"} # Longitude out of range (-180 to 180) +``` + +### Vector Fields + +Vector fields must contain arrays with the correct dimensions and data types: + +```python +# Valid +{"embedding": [0.1, 0.2, 0.3, 0.4]} # 4-dimensional float vector +{"embedding": b'\x00\x01\x02\x03'} # Raw bytes (dimensions not checked) + +# Invalid +{"embedding": [0.1, 0.2, 0.3]} # Wrong dimensions +{"embedding": "not a vector"} # Wrong type +{"embedding": [0.1, "text", 0.3]} # Mixed types +``` + +For integer vectors, the values must be within the appropriate range: + +- **INT8**: -128 to 127 +- **INT16**: -32,768 to 32,767 + +```python +# Valid INT8 vector +{"int_vector": [1, 2, 3]} + +# Invalid INT8 vector +{"int_vector": [1000, 2000, 3000]} # Values out of range +``` + +## Nested JSON Validation + +The validation system supports extracting and validating fields from nested JSON structures: + +```python +# Schema with JSON paths +fields = { + "id": Field(name="id", type=FieldTypes.TAG), + "title": Field(name="title", type=FieldTypes.TEXT, path="$.content.title"), + "rating": Field(name="rating", type=FieldTypes.NUMERIC, path="$.metadata.rating") +} + +# Nested JSON data +data = { + "id": "doc1", + "content": { + "title": "Hello World" + }, + "metadata": { + "rating": 4.5 + } +} + +# Validation extracts fields using JSON paths +validated = validate_object(schema, data) +# Result: {"id": "doc1", "title": "Hello World", "rating": 4.5} +``` + +## Error Handling + +The validation system uses a fail-fast approach, raising a `ValueError` when validation fails: + +```python +try: + validated = validate_object(schema, data) +except ValueError as e: + print(f"Validation error: {e}") + # Handle the error +``` + +The error message includes information about the field that failed validation. + +## Optional Fields + +All fields are considered optional during validation. If a field is missing, it will be excluded from the validated result: + +```python +# Schema with multiple fields +fields = { + "id": Field(name="id", type=FieldTypes.TAG), + "title": Field(name="title", type=FieldTypes.TEXT), + "rating": Field(name="rating", type=FieldTypes.NUMERIC) +} + +# Data with missing fields +data = { + "id": "doc1", + "title": "Hello World" + # rating is missing +} + +# Validation succeeds with partial data +validated = validate_object(schema, data) +# Result: {"id": "doc1", "title": "Hello World"} +``` + +## Performance Considerations + +The validation system is optimized for performance: + +- **Model Caching**: Pydantic models are cached by schema name to avoid regeneration +- **Lazy Validation**: Fields are validated only when needed +- **Fail-Fast Approach**: Processing stops at the first validation error + +For large datasets, validation can be a significant part of the processing time. If you need to write many objects with the same structure, consider validating a sample first to ensure correctness. + +## Limitations + +- **JSON Path**: The current implementation only supports simple dot notation paths (e.g., `$.field.subfield`). Array indexing is not supported. 
+- **Vector Bytes**: When vectors are provided as bytes, the dimensions cannot be validated. +- **Custom Validators**: The current implementation does not support custom user-defined validators. + +## Best Practices + +1. **Define Clear Schemas**: Be explicit about field types and constraints +2. **Pre-validate Critical Data**: For large datasets, validate a sample before processing everything +3. **Handle Validation Errors**: Implement proper error handling for validation failures +4. **Use JSON Paths Carefully**: Test nested JSON extraction to ensure paths are correctly defined +5. **Consider Optional Fields**: Decide which fields are truly required for your application + +## Integration with Storage Classes + +The validation system is fully integrated with the storage classes: + +- **BaseStorage**: For hash-based storage, validates each field individually +- **JsonStorage**: For JSON storage, extracts and validates fields from nested structures + +Each storage class automatically validates data before writing to Redis, ensuring data integrity. \ No newline at end of file diff --git a/redisvl/index/index.py b/redisvl/index/index.py index c4e5de62..0cf9b172 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -130,8 +130,7 @@ def __init__(*args, **kwargs): def _storage(self) -> BaseStorage: """The storage type for the index schema.""" return self._STORAGE_MAP[self.schema.index.storage_type]( - prefix=self.schema.index.prefix, - key_separator=self.schema.index.key_separator, + index_schema=self.schema ) @property @@ -263,6 +262,7 @@ def __init__( redis_client: Optional[redis.Redis] = None, redis_url: Optional[str] = None, connection_kwargs: Optional[Dict[str, Any]] = None, + validate_on_load: bool = False, **kwargs, ): """Initialize the RedisVL search index with a schema, Redis client @@ -277,6 +277,8 @@ def __init__( connect to. connection_kwargs (Dict[str, Any], optional): Redis client connection args. + validate_on_load (bool, optional): Whether to validate data against schema + when loading. Defaults to False. """ if "connection_args" in kwargs: connection_kwargs = kwargs.pop("connection_args") @@ -285,7 +287,7 @@ def __init__( raise ValueError("Must provide a valid IndexSchema object") self.schema = schema - + self._validate_on_load = validate_on_load self._lib_name: Optional[str] = kwargs.pop("lib_name", None) # Store connection parameters @@ -593,7 +595,7 @@ def load( Raises: ValueError: If the length of provided keys does not match the length - of objects. + of objects or if validation fails when validate_on_load is enabled. .. code-block:: python @@ -623,6 +625,7 @@ def add_field(d): ttl=ttl, preprocess=preprocess, batch_size=batch_size, + validate=self._validate_on_load, ) except: logger.exception("Error while loading data to Redis") @@ -934,6 +937,7 @@ def __init__( redis_url: Optional[str] = None, redis_client: Optional[aredis.Redis] = None, connection_kwargs: Optional[Dict[str, Any]] = None, + validate_on_load: bool = False, **kwargs, ): """Initialize the RedisVL async search index with a schema. @@ -946,6 +950,8 @@ def __init__( instantiated redis client. connection_kwargs (Optional[Dict[str, Any]]): Redis client connection args. + validate_on_load (bool, optional): Whether to validate data against schema + when loading. Defaults to False. 
""" if "redis_kwargs" in kwargs: connection_kwargs = kwargs.pop("redis_kwargs") @@ -955,7 +961,7 @@ def __init__( raise ValueError("Must provide a valid IndexSchema object") self.schema = schema - + self._validate_on_load = validate_on_load self._lib_name: Optional[str] = kwargs.pop("lib_name", None) # Store connection parameters @@ -1203,6 +1209,7 @@ async def expire_keys( else: return await client.expire(keys, ttl) + @deprecated_argument("concurrency", "Use batch_size instead.") async def load( self, data: Iterable[Any], @@ -1211,9 +1218,10 @@ async def load( ttl: Optional[int] = None, preprocess: Optional[Callable] = None, concurrency: Optional[int] = None, + batch_size: Optional[int] = None, ) -> List[str]: - """Asynchronously load objects to Redis with concurrency control. - Returns the list of keys loaded to Redis. + """Asynchronously load objects to Redis. Returns the list of keys loaded + to Redis. RedisVL automatically handles constructing the object keys, batching, optional preprocessing steps, and setting optional expiration @@ -1228,18 +1236,18 @@ async def load( Must match the length of objects if provided. Defaults to None. ttl (Optional[int], optional): Time-to-live in seconds for each key. Defaults to None. - preprocess (Optional[Callable], optional): An async function to + preprocess (Optional[Callable], optional): A function to preprocess objects before storage. Defaults to None. - concurrency (Optional[int], optional): The maximum number of - concurrent write operations. Defaults to class's default - concurrency level. + batch_size (Optional[int], optional): Number of objects to write in + a single Redis pipeline execution. Defaults to class's + default batch size. Returns: List[str]: List of keys loaded to Redis. Raises: ValueError: If the length of provided keys does not match the - length of objects. + length of objects or if validation fails when validate_on_load is enabled. .. 
code-block:: python @@ -1255,7 +1263,7 @@ async def load( keys = await index.load(data, keys=["rvl:foo", "rvl:bar"]) # load data with preprocessing step - async def add_field(d): + def add_field(d): d["new_field"] = 123 return d keys = await index.load(data, preprocess=add_field) @@ -1270,7 +1278,8 @@ async def add_field(d): keys=keys, ttl=ttl, preprocess=preprocess, - concurrency=concurrency, + batch_size=batch_size, + validate=self._validate_on_load, ) except: logger.exception("Error while loading data to Redis") diff --git a/redisvl/index/storage.py b/redisvl/index/storage.py index 2be386c0..f90e45b4 100644 --- a/redisvl/index/storage.py +++ b/redisvl/index/storage.py @@ -1,14 +1,18 @@ -import asyncio -from typing import Any, Callable, Dict, Iterable, List, Optional +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple -from pydantic import BaseModel +from pydantic import BaseModel, ValidationError from redis import Redis from redis.asyncio import Redis as AsyncRedis from redis.commands.search.indexDefinition import IndexType from redisvl.redis.utils import convert_bytes +from redisvl.schema import IndexSchema +from redisvl.schema.validation import validate_object +from redisvl.utils.log import get_logger from redisvl.utils.utils import create_ulid +logger = get_logger(__name__) + class BaseStorage(BaseModel): """ @@ -20,14 +24,10 @@ class BaseStorage(BaseModel): type: IndexType """Type of index used in storage""" - prefix: str - """Prefix for Redis keys""" - key_separator: str - """Separator between prefix and key value""" + index_schema: IndexSchema + """Index schema definition""" default_batch_size: int = 200 """Default size for batch operations""" - default_write_concurrency: int = 20 - """Default concurrency for async ops""" @staticmethod def _key(id: str, prefix: str, key_separator: str) -> str: @@ -72,7 +72,9 @@ def _create_key(self, obj: Dict[str, Any], id_field: Optional[str] = None) -> st raise ValueError(f"Key field {id_field} not found in record {obj}") return self._key( - key_value, prefix=self.prefix, key_separator=self.key_separator + key_value, + prefix=self.index_schema.index.prefix, + key_separator=self.index_schema.index.key_separator, ) @staticmethod @@ -92,35 +94,6 @@ def _preprocess(obj: Any, preprocess: Optional[Callable] = None) -> Dict[str, An obj = preprocess(obj) return obj - @staticmethod - async def _apreprocess( - obj: Any, preprocess: Optional[Callable] = None - ) -> Dict[str, Any]: - """Asynchronously apply a preprocessing function to the object if - provided. - - Args: - preprocess (Optional[Callable], optional): Async function to - process the object. - obj (Any): Object to preprocess. - - Returns: - Dict[str, Any]: Processed object as a dictionary. - """ - # optionally async preprocess object - if preprocess: - obj = await preprocess(obj) - return obj - - def _validate(self, obj: Dict[str, Any]): - """Validate the object before writing to Redis. This method should be - implemented by subclasses. - - Args: - obj (Dict[str, Any]): The object to validate. - """ - raise NotImplementedError - @staticmethod def _set(client: Redis, key: str, obj: Dict[str, Any]): """Synchronously set the value in Redis for the given key. @@ -169,6 +142,84 @@ async def _aget(client: AsyncRedis, key: str) -> Dict[str, Any]: """ raise NotImplementedError + def validate(self, obj: Dict[str, Any]) -> Dict[str, Any]: + """ + Validate an object against the schema using Pydantic-based validation. 
+ + Args: + obj: The object to validate + + Returns: + Validated object with any type coercions applied + + Raises: + ValueError: If validation fails + """ + # Pass directly to validation function and let any errors propagate + return validate_object(self.index_schema, obj) + + def _preprocess_and_validate_objects( + self, + objects: List[Any], + id_field: Optional[str] = None, + keys: Optional[Iterable[str]] = None, + preprocess: Optional[Callable] = None, + validate: bool = False, + ) -> List[Tuple[str, Dict[str, Any]]]: + """ + Preprocess and validate a list of objects with fail-fast approach. + + Args: + objects: List of objects to preprocess and validate + id_field: Field to use as the key + keys: Optional iterable of keys + preprocess: Optional preprocessing function + validate: Whether to validate against schema + + Returns: + List of tuples (key, processed_obj) for valid objects + + Raises: + ValueError: If any validation fails with object context + """ + prepared_objects = [] + keys_iterator = iter(keys) if keys else None + + for i, obj in enumerate(objects): + try: + # Generate key + key = ( + next(keys_iterator) + if keys_iterator + else self._create_key(obj, id_field) + ) + + # Preprocess + processed_obj = self._preprocess(obj, preprocess) + + # Basic type validation + if not isinstance(processed_obj, dict): + raise ValueError( + f"Object must be a dictionary, got {type(processed_obj).__name__}" + ) + + # Schema validation if enabled + if validate: + processed_obj = self.validate(processed_obj) + + # Store valid object with its key for writing + prepared_objects.append((key, processed_obj)) + + except Exception as e: + # Enhance error message with object context + object_id = f"at index {i}" + if id_field and isinstance(obj, dict) and id_field in obj: + object_id = f"with {id_field}={obj[id_field]}" + + raise ValueError(f"Validation failed for object {object_id}: {str(e)}") + + return prepared_objects + def write( self, redis_client: Redis, @@ -178,6 +229,7 @@ def write( ttl: Optional[int] = None, preprocess: Optional[Callable] = None, batch_size: Optional[int] = None, + validate: bool = False, ) -> List[str]: """Write a batch of objects to Redis as hash entries. This method returns a list of Redis keys written to the database. @@ -195,44 +247,52 @@ def write( objects before storage. Defaults to None. batch_size (Optional[int], optional): Number of objects to write in a single Redis pipeline execution. + validate (bool, optional): Whether to validate objects against schema. + Defaults to False. Raises: ValueError: If the length of provided keys does not match the - length of objects. + length of objects, or if validation fails. 
""" if keys and len(keys) != len(objects): # type: ignore raise ValueError("Length of keys does not match the length of objects") if batch_size is None: - # Use default or calculate based on the input data batch_size = self.default_batch_size - keys_iterator = iter(keys) if keys else None - added_keys: List[str] = [] - - if objects: - with redis_client.pipeline(transaction=False) as pipe: - for i, obj in enumerate(objects, start=1): - # Construct key, validate, and write - key = ( - next(keys_iterator) - if keys_iterator - else self._create_key(obj, id_field) - ) - obj = self._preprocess(obj, preprocess) - self._validate(obj) - self._set(pipe, key, obj) - # Set TTL if provided - if ttl: - pipe.expire(key, ttl) - # Execute mini batch - if i % batch_size == 0: - pipe.execute() - added_keys.append(key) - # Clean up batches if needed - if i % batch_size != 0: + if not objects: + return [] + + # Pass 1: Preprocess and validate all objects + prepared_objects = self._preprocess_and_validate_objects( + objects, + id_field=id_field, + keys=keys, + preprocess=preprocess, + validate=validate, + ) + + # Pass 2: Write all valid objects in batches + added_keys = [] + + with redis_client.pipeline(transaction=False) as pipe: + for i, (key, obj) in enumerate(prepared_objects, start=1): + self._set(pipe, key, obj) + + # Set TTL if provided + if ttl: + pipe.expire(key, ttl) + + added_keys.append(key) + + # Execute in batches + if i % batch_size == 0: pipe.execute() + # Execute any remaining commands + if len(prepared_objects) % batch_size != 0: + pipe.execute() + return added_keys async def awrite( @@ -242,12 +302,12 @@ async def awrite( id_field: Optional[str] = None, keys: Optional[Iterable[str]] = None, ttl: Optional[int] = None, + batch_size: Optional[int] = None, preprocess: Optional[Callable] = None, - concurrency: Optional[int] = None, + validate: bool = False, ) -> List[str]: - """Asynchronously write objects to Redis as hash entries with - concurrency control. The method returns a list of keys written to the - database. + """Asynchronously write objects to Redis as hash entries using pipeline batching. + The method returns a list of keys written to the database. Args: redis_client (AsyncRedis): An asynchronous Redis client used @@ -259,47 +319,60 @@ async def awrite( Must match the length of objects if provided. ttl (Optional[int], optional): Time-to-live in seconds for each key. Defaults to None. + batch_size (Optional[int], optional): Number of objects to write + in a single Redis pipeline execution. preprocess (Optional[Callable], optional): An async function to preprocess objects before storage. Defaults to None. - concurrency (Optional[int], optional): The maximum number of - concurrent write operations. Defaults to class's default - concurrency level. + validate (bool, optional): Whether to validate objects against schema. + Defaults to False. Returns: List[str]: List of Redis keys loaded to the databases. Raises: ValueError: If the length of provided keys does not match the - length of objects. + length of objects, or if validation fails. 
""" if keys and len(keys) != len(objects): # type: ignore raise ValueError("Length of keys does not match the length of objects") - if not concurrency: - concurrency = self.default_write_concurrency + if batch_size is None: + batch_size = self.default_batch_size - semaphore = asyncio.Semaphore(concurrency) - keys_iterator = iter(keys) if keys else None + if not objects: + return [] + + # Pass 1: Preprocess and validate all objects + prepared_objects = self._preprocess_and_validate_objects( + objects, + id_field=id_field, + keys=keys, + preprocess=preprocess, + validate=validate, + ) + + # Pass 2: Write all valid objects in batches using pipeline + added_keys = [] + + async with redis_client.pipeline(transaction=False) as pipe: + for i, (key, obj) in enumerate(prepared_objects, start=1): + await self._aset(pipe, key, obj) - async def _load(obj: Dict[str, Any], key: Optional[str] = None) -> str: - async with semaphore: - if key is None: - key = self._create_key(obj, id_field) - obj = await self._apreprocess(obj, preprocess) - self._validate(obj) - await self._aset(redis_client, key, obj) + # Set TTL if provided if ttl: - await redis_client.expire(key, ttl) - return key + await pipe.expire(key, ttl) - if keys_iterator: - tasks = [ - asyncio.create_task(_load(obj, next(keys_iterator))) for obj in objects - ] - else: - tasks = [asyncio.create_task(_load(obj)) for obj in objects] + added_keys.append(key) + + # Execute in batches + if i % batch_size == 0: + await pipe.execute() - return await asyncio.gather(*tasks) + # Execute any remaining commands + if len(prepared_objects) % batch_size != 0: + await pipe.execute() + + return added_keys def get( self, redis_client: Redis, keys: Iterable[str], batch_size: Optional[int] = None @@ -325,9 +398,7 @@ def get( return [] if batch_size is None: - batch_size = ( - self.default_batch_size - ) # Use default or calculate based on the input data + batch_size = self.default_batch_size # Use a pipeline to batch the retrieval with redis_client.pipeline(transaction=False) as pipe: @@ -345,39 +416,42 @@ async def aget( self, redis_client: AsyncRedis, keys: Iterable[str], - concurrency: Optional[int] = None, + batch_size: Optional[int] = None, ) -> List[Dict[str, Any]]: - """Asynchronously retrieve objects from Redis by keys, with concurrency - control. + """Asynchronously retrieve objects from Redis by keys. Args: redis_client (AsyncRedis): Asynchronous Redis client. keys (Iterable[str]): Keys to retrieve from Redis. - concurrency (Optional[int], optional): The number of concurrent - requests to make. + batch_size (Optional[int], optional): Number of objects to write + in a single Redis pipeline execution. Defaults to class's + default batch size. Returns: Dict[str, Any]: Dictionary with keys and their corresponding objects. 
""" + results: List = [] + if not isinstance(keys, Iterable): # type: ignore raise TypeError("Keys must be an iterable of strings") if len(keys) == 0: # type: ignore return [] - if not concurrency: - concurrency = self.default_write_concurrency - - semaphore = asyncio.Semaphore(concurrency) + if batch_size is None: + batch_size = self.default_batch_size - async def _get(key: str) -> Dict[str, Any]: - async with semaphore: - result = await self._aget(redis_client, key) - return result + # Use a pipeline to batch the retrieval + async with redis_client.pipeline(transaction=False) as pipe: + for i, key in enumerate(keys, start=1): + await self._aget(pipe, key) + if i % batch_size == 0: + results.extend(await pipe.execute()) + if i % batch_size != 0: + results.extend(await pipe.execute()) - tasks = [asyncio.create_task(_get(key)) for key in keys] - results = await asyncio.gather(*tasks) + # Process results return convert_bytes(results) @@ -392,19 +466,6 @@ class HashStorage(BaseStorage): type: IndexType = IndexType.HASH """Hash data type for the index""" - def _validate(self, obj: Dict[str, Any]): - """Validate that the given object is a dictionary, suitable for storage - as a Redis hash. - - Args: - obj (Dict[str, Any]): The object to validate. - - Raises: - TypeError: If the object is not a dictionary. - """ - if not isinstance(obj, dict): - raise TypeError("Object must be a dictionary.") - @staticmethod def _set(client: Redis, key: str, obj: Dict[str, Any]): """Synchronously set a hash value in Redis for the given key. @@ -465,19 +526,6 @@ class JsonStorage(BaseStorage): type: IndexType = IndexType.JSON """JSON data type for the index""" - def _validate(self, obj: Dict[str, Any]): - """Validate that the given object is a dictionary, suitable for JSON - serialization. - - Args: - obj (Dict[str, Any]): The object to validate. - - Raises: - TypeError: If the object is not a dictionary. - """ - if not isinstance(obj, dict): - raise TypeError("Object must be a dictionary.") - @staticmethod def _set(client: Redis, key: str, obj: Dict[str, Any]): """Synchronously set a JSON obj in Redis for the given key. 
diff --git a/redisvl/schema/__init__.py b/redisvl/schema/__init__.py index 24f6b821..c835ccd5 100644 --- a/redisvl/schema/__init__.py +++ b/redisvl/schema/__init__.py @@ -1,3 +1,35 @@ +from redisvl.schema.fields import ( + BaseField, + FieldTypes, + FlatVectorField, + GeoField, + HNSWVectorField, + NumericField, + TagField, + TextField, + VectorDataType, + VectorDistanceMetric, + VectorIndexAlgorithm, +) from redisvl.schema.schema import IndexInfo, IndexSchema, StorageType -__all__ = ["StorageType", "IndexSchema", "IndexInfo"] +# Expose validation functionality +from redisvl.schema.validation import validate_object + +__all__ = [ + "IndexSchema", + "IndexInfo", + "StorageType", + "FieldTypes", + "VectorDistanceMetric", + "VectorDataType", + "VectorIndexAlgorithm", + "BaseField", + "TextField", + "TagField", + "NumericField", + "GeoField", + "FlatVectorField", + "HNSWVectorField", + "validate_object", +] diff --git a/redisvl/schema/fields.py b/redisvl/schema/fields.py index 17714480..b77188d7 100644 --- a/redisvl/schema/fields.py +++ b/redisvl/schema/fields.py @@ -164,12 +164,14 @@ class BaseField(BaseModel): """Specified field attributes""" def _handle_names(self) -> Tuple[str, Optional[str]]: + """Helper to handle field naming with path support""" if self.path: return self.path, self.name return self.name, None def as_redis_field(self) -> RedisField: - raise NotImplementedError + """Convert schema field to Redis Field object""" + raise NotImplementedError("Must be implemented by field subclasses") class TextField(BaseField): diff --git a/redisvl/schema/schema.py b/redisvl/schema/schema.py index 33dfd9c7..90617d18 100644 --- a/redisvl/schema/schema.py +++ b/redisvl/schema/schema.py @@ -8,6 +8,7 @@ from redis.commands.search.field import Field as RedisField from redisvl.schema.fields import BaseField, FieldFactory +from redisvl.schema.type_utils import TypeInferrer from redisvl.utils.log import get_logger from redisvl.utils.utils import model_to_dict @@ -455,64 +456,3 @@ def to_yaml(self, file_path: str, overwrite: bool = True) -> None: with open(fp, "w") as f: yaml_data = self.to_dict() yaml.dump(yaml_data, f, sort_keys=False) - - -class TypeInferrer: - """Infers the type of a field based on its value.""" - - GEO_PATTERN = re.compile( - r"^\s*[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?),\s*[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)\s*$" - ) - - TYPE_METHOD_MAP = { - "numeric": "_is_numeric", - "geo": "_is_geographic", - "tag": "_is_tag", - "text": "_is_text", - } - - @classmethod - def infer(cls, value: Any) -> str: - """Infers the field type for a given value. - - Args: - value: The value to infer the type of. - - Returns: - The inferred field type as a string. - - Raises: - ValueError: If the type cannot be inferred. 
- """ - for type_name, method_name in cls.TYPE_METHOD_MAP.items(): - if getattr(cls, method_name)(value): - return type_name - raise ValueError(f"Unable to infer type for value: {value}") - - @classmethod - def _is_numeric(cls, value: Any) -> bool: - """Check if the value is numeric.""" - if not isinstance(value, (int, float, str)): - return False - try: - float(value) - return True - except (ValueError, TypeError): - return False - - @classmethod - def _is_tag(cls, value: Any) -> bool: - """Check if the value is a tag.""" - return isinstance(value, (list, set, tuple)) and all( - isinstance(v, str) for v in value - ) - - @classmethod - def _is_text(cls, value: Any) -> bool: - """Check if the value is text.""" - return isinstance(value, str) - - @classmethod - def _is_geographic(cls, value: Any) -> bool: - """Check if the value is a geographic coordinate.""" - return isinstance(value, str) and cls.GEO_PATTERN.match(value) is not None diff --git a/redisvl/schema/type_utils.py b/redisvl/schema/type_utils.py new file mode 100644 index 00000000..83329961 --- /dev/null +++ b/redisvl/schema/type_utils.py @@ -0,0 +1,63 @@ +import re +from typing import Any + + +class TypeInferrer: + """Infers the type of a field based on its value.""" + + GEO_PATTERN = re.compile( + r"^\s*[-+]?([1-8]?\d(\.\d+)?|90(\.0+)?),\s*[-+]?(180(\.0+)?|((1[0-7]\d)|([1-9]?\d))(\.\d+)?)\s*$" + ) + + TYPE_METHOD_MAP = { + "numeric": "_is_numeric", + "geo": "_is_geographic", + "tag": "_is_tag", + "text": "_is_text", + } + + @classmethod + def infer(cls, value: Any) -> str: + """Infers the field type for a given value. + + Args: + value: The value to infer the type of. + + Returns: + The inferred field type as a string. + + Raises: + ValueError: If the type cannot be inferred. + """ + for type_name, method_name in cls.TYPE_METHOD_MAP.items(): + if getattr(cls, method_name)(value): + return type_name + raise ValueError(f"Unable to infer type for value: {value}") + + @classmethod + def _is_numeric(cls, value: Any) -> bool: + """Check if the value is numeric.""" + if not isinstance(value, (int, float, str)): + return False + try: + float(value) + return True + except (ValueError, TypeError): + return False + + @classmethod + def _is_tag(cls, value: Any) -> bool: + """Check if the value is a tag.""" + return isinstance(value, (list, set, tuple)) and all( + isinstance(v, str) for v in value + ) + + @classmethod + def _is_text(cls, value: Any) -> bool: + """Check if the value is text.""" + return isinstance(value, str) + + @classmethod + def _is_geographic(cls, value: Any) -> bool: + """Check if the value is a geographic coordinate.""" + return isinstance(value, str) and cls.GEO_PATTERN.match(value) is not None diff --git a/redisvl/schema/validation.py b/redisvl/schema/validation.py new file mode 100644 index 00000000..51fcf445 --- /dev/null +++ b/redisvl/schema/validation.py @@ -0,0 +1,290 @@ +""" +RedisVL Schema Validation Module + +This module provides utilities for validating data against RedisVL schemas +using dynamically generated Pydantic models. 
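+
+Example (illustrative sketch; assumes ``schema`` is an ``IndexSchema`` whose
+fields include a numeric ``age``):
+
+.. code-block:: python
+
+    from redisvl.schema.validation import validate_object
+
+    validate_object(schema, {"age": 42})     # returns {"age": 42}
+    validate_object(schema, {"age": "n/a"})  # raises a pydantic ValidationError
+                                             # (a ValueError subclass)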
+""" + +import json +import re +import warnings +from typing import Any, Dict, List, Optional, Type, Union + +from pydantic import BaseModel, Field, ValidationError, create_model, field_validator + +from redisvl.schema import IndexSchema +from redisvl.schema.fields import BaseField, FieldTypes, VectorDataType +from redisvl.schema.schema import StorageType +from redisvl.schema.type_utils import TypeInferrer +from redisvl.utils.log import get_logger + +logger = get_logger(__name__) + + +class SchemaModelGenerator: + """ + Generates and caches Pydantic models based on Redis schema definitions. + + This class handles the conversion of RedisVL IndexSchema objects into + Pydantic models with appropriate field types and validators. + """ + + _model_cache: Dict[str, Type[BaseModel]] = {} + + @classmethod + def get_model_for_schema(cls, schema: IndexSchema) -> Type[BaseModel]: + """ + Get or create a Pydantic model for a schema. + + Args: + schema: The IndexSchema to convert to a Pydantic model + + Returns: + A Pydantic model class that can validate data against the schema + """ + # Use schema identifier as cache key + cache_key = schema.index.name + + if cache_key not in cls._model_cache: + cls._model_cache[cache_key] = cls._create_model(schema) + + return cls._model_cache[cache_key] + + @classmethod + def _map_field_to_pydantic_type( + cls, field: BaseField, storage_type: StorageType + ) -> Type: + """ + Map Redis field types to appropriate Pydantic types. + + Args: + field: The Redis field definition + storage_type: The storage type (HASH or JSON) + + Returns: + The Pydantic field type + + Raises: + ValueError: If the field type is not supported + """ + if field.type == FieldTypes.TEXT: + return str + elif field.type == FieldTypes.TAG: + return str + elif field.type == FieldTypes.NUMERIC: + return Union[int, float] + elif field.type == FieldTypes.GEO: + return str + elif field.type == FieldTypes.VECTOR: + # For JSON storage, vectors are always lists + if storage_type == StorageType.JSON: + return List[Union[int, float]] + else: + return bytes + + # If we get here, the field type is not supported + raise ValueError(f"Unsupported field type: {field.type}") + + @classmethod + def _create_model(cls, schema: IndexSchema) -> Type[BaseModel]: + """ + Create a Pydantic model from schema definition. 
+
+        Args:
+            schema: The IndexSchema to convert
+
+        Returns:
+            A Pydantic model class with appropriate fields and validators
+        """
+        field_definitions = {}
+        validators = {}
+
+        # Get storage type from schema
+        storage_type = schema.index.storage_type
+
+        # Create field definitions dictionary for create_model
+        for field_name, field in schema.fields.items():
+            field_type = cls._map_field_to_pydantic_type(field, storage_type)
+
+            # Create field definition (all fields are optional in the model)
+            # this handles the cases where objects have missing fields (supported behavior)
+            field_definitions[field_name] = (
+                Optional[field_type],  # Make fields optional
+                Field(
+                    default=None,
+                    json_schema_extra={
+                        "field_type": field.type,
+                    },
+                ),
+            )
+
+            # Add field-specific validator info to our validator registry
+            if field.type == FieldTypes.GEO:
+                validators[field_name] = {"type": "geo"}
+
+            elif field.type == FieldTypes.VECTOR:
+                validators[field_name] = {
+                    "type": "vector",
+                    "dims": field.attrs.dims,
+                    "datatype": field.attrs.datatype,
+                    "storage_type": storage_type,
+                }
+
+        # Build the pydantic validators up front: pydantic only registers
+        # validators supplied at class-creation time (via __validators__);
+        # attaching them afterwards with setattr would silently do nothing.
+        model_validators = {}
+        for field_name, validator_info in validators.items():
+            if validator_info["type"] == "geo":
+                # Add geo validator
+                model_validators[f"validate_{field_name}"] = cls._create_geo_validator(
+                    field_name
+                )
+
+            elif validator_info["type"] == "vector":
+                # Add vector validator
+                model_validators[f"validate_{field_name}"] = cls._create_vector_validator(
+                    field_name,
+                    validator_info["dims"],
+                    validator_info["datatype"],
+                    validator_info["storage_type"],
+                )
+
+        # Create the model class with field definitions and validators
+        model_name = f"{schema.index.name}__PydanticModel"
+        return create_model(model_name, __validators__=model_validators, **field_definitions)
+
+    @staticmethod
+    def _create_geo_validator(field_name: str):
+        """
+        Create a validator for geo fields.
+
+        Args:
+            field_name: Name of the field to validate
+
+        Returns:
+            A validator function that can be attached to a Pydantic model
+        """
+
+        # Create the validator function
+        def validate_geo_field(cls, value):
+            # Skip validation for None values
+            if value is not None:
+                # Validate against pattern
+                if not re.match(TypeInferrer.GEO_PATTERN.pattern, value):
+                    raise ValueError(
+                        f"Geo field '{field_name}' value '{value}' is not a valid 'lat,lon' format"
+                    )
+            return value
+
+        # Add the field_validator decorator
+        return field_validator(field_name, mode="after")(validate_geo_field)
+
+    @staticmethod
+    def _create_vector_validator(
+        field_name: str, dims: int, datatype: VectorDataType, storage_type: StorageType
+    ):
+        """
+        Create a validator for vector fields.
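+
+        Example of the resulting behavior (illustrative sketch; assumes a model
+        built from a schema with a 3-dim INT8 vector field named ``vec``):
+
+        .. code-block:: python
+
+            model_cls.model_validate({"vec": [1, 2, 3]})  # ok
+            model_cls.model_validate({"vec": [1, 2]})     # raises: wrong dimensions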
+
+        Args:
+            field_name: Name of the field to validate
+            dims: Expected dimensions of the vector
+            datatype: Expected datatype of the vector elements
+            storage_type: Type of storage (HASH or JSON)
+
+        Returns:
+            A validator function that can be attached to a Pydantic model
+        """
+
+        # Create the validator function. It runs in "before" mode so the raw
+        # input type (list vs. bytes vs. anything else) can be inspected prior
+        # to pydantic's own type coercion.
+        def validate_vector_field(cls, value):
+            # Skip validation for None values
+            if value is not None:
+
+                # Handle list representation
+                if isinstance(value, list):
+
+                    # Validate dimensions
+                    if len(value) != dims:
+                        raise ValueError(
+                            f"Vector field '{field_name}' must have {dims} dimensions, got {len(value)}"
+                        )
+
+                    # Normalize the datatype name (handles both enum members
+                    # and plain strings)
+                    datatype_str = getattr(datatype, "value", str(datatype)).upper()
+
+                    # Integer-based datatypes
+                    if datatype_str in ("INT8", "UINT8"):
+                        # Check type
+                        if not all(isinstance(v, int) for v in value):
+                            raise ValueError(
+                                f"Vector field '{field_name}' must contain only integer values for {datatype_str}"
+                            )
+                        # Check the representable 8-bit range
+                        min_val, max_val = (-128, 127) if datatype_str == "INT8" else (0, 255)
+                        if not all(min_val <= v <= max_val for v in value):
+                            raise ValueError(
+                                f"Vector field '{field_name}' {datatype_str} values must be between {min_val} and {max_val}"
+                            )
+
+                    # Float-based datatypes must contain only numbers
+                    elif not all(isinstance(v, (int, float)) for v in value):
+                        raise ValueError(
+                            f"Vector field '{field_name}' must contain only numeric values for {datatype_str}"
+                        )
+
+                # Raw bytes (HASH storage) are passed through as-is; any other
+                # type is rejected
+                elif not isinstance(value, bytes):
+                    raise ValueError(
+                        f"Vector field '{field_name}' must be a list or bytes, got {type(value).__name__}"
+                    )
+
+            return value
+
+        # Add the field_validator decorator (mirrors _create_geo_validator)
+        return field_validator(field_name, mode="before")(validate_vector_field)
+
+
+def extract_from_json_path(obj: Dict[str, Any], path: str) -> Any:
+    """
+    Extract a value from a nested JSON object using a path.
+
+    Args:
+        obj: The object to extract values from
+        path: JSONPath-style path (e.g., $.field.subfield)
+
+    Returns:
+        The extracted value or None if not found
+    """
+    # Handle JSONPath syntax (e.g., $.field.subfield)
+    if path.startswith("$."):
+        path_parts = path[2:].split(".")
+    else:
+        path_parts = path.split(".")
+
+    current = obj
+    for part in path_parts:
+        if isinstance(current, dict) and part in current:
+            current = current[part]
+        else:
+            return None
+
+    return current
+
+
+def validate_object(schema: IndexSchema, obj: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Validate an object against a schema.
+
+    Args:
+        schema: The IndexSchema to validate against
+        obj: The object to validate
+
+    Returns:
+        Validated object with any type coercions applied
+
+    Raises:
+        ValueError: If validation fails with enhanced error message
+    """
+    # Get Pydantic model for this schema
+    model_class = SchemaModelGenerator.get_model_for_schema(schema)
+
+    # Prepare object for validation
+    # Handle nested JSON if needed
+    if schema.index.storage_type == StorageType.JSON:
+        # Extract values from nested paths
+        flat_obj = {}
+        for field_name, field in schema.fields.items():
+            if field.path:
+                value = extract_from_json_path(obj, field.path)
+                if value is not None:
+                    flat_obj[field_name] = value
+            elif field_name in obj:
+                flat_obj[field_name] = obj[field_name]
+    else:
+        flat_obj = obj
+
+    # Validate against model
+    validated = model_class.model_validate(flat_obj)
+    return validated.model_dump(exclude_none=True)
diff --git a/tests/integration/test_async_search_index.py b/tests/integration/test_async_search_index.py
index edc6c01a..ea122d5d 100644
--- a/tests/integration/test_async_search_index.py
+++ b/tests/integration/test_async_search_index.py
@@ -284,7 +284,7 @@ async def preprocess(record):
     async def bad_preprocess(record):
         return 1
 
-    with pytest.raises(TypeError):
+    with pytest.raises(ValueError):
         await async_index.load(data, id_field="id", preprocess=bad_preprocess)
 
 
diff --git a/tests/integration/test_flow_async.py b/tests/integration/test_flow_async.py
index a368f677..c727fd28 100644
--- a/tests/integration/test_flow_async.py
+++ b/tests/integration/test_flow_async.py
@@ -52,7 +52,7 @@ async def test_simple(async_client, schema, sample_data):
    await index.create(overwrite=True, drop=True)
 
     # Prepare and load the data based on storage type
-    async def hash_preprocess(item: dict) -> dict:
+    def 
hash_preprocess(item: dict) -> dict: return { **item, "user_embedding": array_to_buffer(item["user_embedding"], "float32"), diff --git a/tests/integration/test_search_index.py b/tests/integration/test_search_index.py index 800f6a06..02b6d5e4 100644 --- a/tests/integration/test_search_index.py +++ b/tests/integration/test_search_index.py @@ -268,7 +268,7 @@ def preprocess(record): def bad_preprocess(record): return 1 - with pytest.raises(TypeError): + with pytest.raises(ValueError): index.load(data, id_field="id", preprocess=bad_preprocess) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 00000000..91a558f2 --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,183 @@ +""" +Common test fixtures and utilities for RedisVL validation tests. +""" + +from typing import Any, Dict + +import pytest + +from redisvl.schema import IndexSchema +from redisvl.schema.fields import VectorDataType, VectorDistanceMetric + + +@pytest.fixture +def comprehensive_schema(): + """Create a comprehensive schema with all field types for testing.""" + return IndexSchema.from_dict( + { + "index": { + "name": "test-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "hash", + }, + "fields": [ + {"name": "id", "type": "tag"}, + {"name": "title", "type": "text"}, + {"name": "rating", "type": "numeric"}, + {"name": "location", "type": "geo"}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", + }, + }, + { + "name": "int_vector", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "datatype": "int8", + "distance_metric": "l2", + }, + }, + { + "name": "hnsw_vector", + "type": "vector", + "attrs": { + "algorithm": "hnsw", + "dims": 3, + "distance_metric": "cosine", + "datatype": "float32", + "m": 16, + "ef_construction": 200, + "ef_runtime": 10, + "epsilon": 0.01, + }, + }, + ], + } + ) + + +@pytest.fixture +def json_schema(): + """Create a schema with JSON storage and path fields.""" + return IndexSchema.from_dict( + { + "index": { + "name": "test-json-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + {"name": "id", "type": "tag", "path": "$.id"}, + {"name": "user", "type": "tag", "path": "$.metadata.user"}, + {"name": "title", "type": "text", "path": "$.content.title"}, + {"name": "rating", "type": "numeric", "path": "$.metadata.rating"}, + { + "name": "embedding", + "type": "vector", + "path": "$.content.embedding", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", + }, + }, + ], + } + ) + + +@pytest.fixture +def valid_data(): + """Sample valid data for testing validation.""" + return { + "id": "doc1", + "title": "Test Document", + "rating": 4.5, + "location": "37.7749,-122.4194", + "embedding": [0.1, 0.2, 0.3, 0.4], + "int_vector": [1, 2, 3], + "hnsw_vector": [0.1, 0.2, 0.3], + } + + +@pytest.fixture +def valid_nested_data(): + """Sample valid nested data for testing JSON path validation.""" + return { + "id": "doc1", + "metadata": {"user": "user123", "rating": 4.5}, + "content": {"title": "Test Document", "embedding": [0.1, 0.2, 0.3, 0.4]}, + } + + +@pytest.fixture +def invalid_data_cases(): + """ + Test cases for invalid data. 
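+
+    Intended to be consumed in a loop against ``comprehensive_schema``
+    (illustrative sketch):
+
+        for case in invalid_data_cases:
+            with pytest.raises(ValueError) as exc:
+                validate_object(comprehensive_schema, {case["field"]: case["value"]})
+            assert case["error_text"] in str(exc.value)
+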
+ Each case contains: + - field: name of the field + - value: invalid value to test + - error_text: text that should appear in error message + """ + return [ + # Text field errors + {"field": "title", "value": 123, "error_text": "must be a string"}, + # Numeric field errors + {"field": "rating", "value": "high", "error_text": "must be a number"}, + {"field": "rating", "value": "123.45", "error_text": "must be a number"}, + # Tag field errors + {"field": "id", "value": 123, "error_text": "must be a string"}, + # Geo field errors + { + "field": "location", + "value": "invalid_geo", + "error_text": "not a valid 'lat,lon' format", + }, + { + "field": "location", + "value": "1000,-1000", + "error_text": "not a valid 'lat,lon' format", + }, + # Vector field errors - float32 + {"field": "embedding", "value": [0.1, 0.2, 0.3], "error_text": "dimensions"}, + { + "field": "embedding", + "value": [0.1, "string", 0.3, 0.4], + "error_text": "numeric values", + }, + { + "field": "embedding", + "value": "not_a_vector", + "error_text": "must be a list or bytes", + }, + # Vector field errors - int8 + { + "field": "int_vector", + "value": [0.1, 0.2, 0.3], + "error_text": "integer values", + }, + {"field": "int_vector", "value": [1, 2], "error_text": "dimensions"}, + { + "field": "int_vector", + "value": [1000, 2000, 3000], + "error_text": "INT8 values must be between", + }, + # HNSW Vector field errors + {"field": "hnsw_vector", "value": [0.1, 0.2], "error_text": "dimensions"}, + { + "field": "hnsw_vector", + "value": ["a", "b", "c"], + "error_text": "numeric values", + }, + ] diff --git a/tests/unit/test_edge_cases.py b/tests/unit/test_edge_cases.py new file mode 100644 index 00000000..3646cc1f --- /dev/null +++ b/tests/unit/test_edge_cases.py @@ -0,0 +1,451 @@ +""" +Tests for edge cases in the RedisVL validation module. + +This module tests edge cases in the validation system that might not be +covered in the main test files, including: +1. Performance and caching behavior +2. Handling of unusual data types +3. Extreme values +4. 
Boundary conditions +""" + +import time +from typing import Any, Dict, List + +import pytest + +from redisvl.index.storage import BaseStorage +from redisvl.schema.fields import Field, FieldTypes, VectorDataType +from redisvl.schema.index import Index, IndexSchema +from redisvl.schema.validation import SchemaModelGenerator, validate_object + + +class TestSchemaModelCaching: + """Tests for model caching behavior.""" + + def test_caching_improves_performance(self): + """Test that caching improves model generation performance.""" + # Create a complex schema + fields = { + f"field_{i}": Field(name=f"field_{i}", type=FieldTypes.TEXT) + for i in range(50) # 50 fields should be enough to measure performance + } + + schema = IndexSchema( + index=Index(name="performance_test", prefix="doc"), fields=fields + ) + + # First generation (not cached) + start_time = time.time() + model1 = SchemaModelGenerator.get_model_for_schema(schema) + first_time = time.time() - start_time + + # Second generation (should be cached) + start_time = time.time() + model2 = SchemaModelGenerator.get_model_for_schema(schema) + second_time = time.time() - start_time + + # Verify second generation is faster + assert second_time < first_time + + # Should be much faster (usually at least 10x) + assert second_time < (first_time * 0.5) + + # Verify same model instance + assert model1 is model2 + + def test_different_schemas_get_different_models(self): + """Test that different schemas get different model instances.""" + # Create two different schemas + schema1 = IndexSchema( + index=Index(name="test1", prefix="doc1"), + fields={"field1": Field(name="field1", type=FieldTypes.TEXT)}, + ) + + schema2 = IndexSchema( + index=Index(name="test2", prefix="doc2"), + fields={"field1": Field(name="field1", type=FieldTypes.TEXT)}, + ) + + # Get models + model1 = SchemaModelGenerator.get_model_for_schema(schema1) + model2 = SchemaModelGenerator.get_model_for_schema(schema2) + + # Verify different model instances + assert model1 is not model2 + assert model1.__name__ != model2.__name__ + + +class TestUnusualDataTypes: + """Tests for handling unusual data types during validation.""" + + @pytest.fixture + def basic_schema(self): + """Create a basic schema for testing.""" + return IndexSchema( + index=Index(name="test", prefix="doc"), + fields={ + "text_field": Field(name="text_field", type=FieldTypes.TEXT), + "tag_field": Field(name="tag_field", type=FieldTypes.TAG), + "num_field": Field(name="num_field", type=FieldTypes.NUMERIC), + }, + ) + + def test_none_values(self, basic_schema): + """Test handling of None values.""" + # Data with None values + data = {"text_field": None, "tag_field": None, "num_field": None} + + # Validate + result = validate_object(basic_schema, data) + + # None values should be excluded + assert len(result) == 0 + + def test_empty_string_values(self, basic_schema): + """Test handling of empty strings.""" + # Data with empty strings + data = {"text_field": "", "tag_field": "", "num_field": 0} + + # Validate + result = validate_object(basic_schema, data) + + # Empty strings are valid for text and tag + assert result["text_field"] == "" + assert result["tag_field"] == "" + assert result["num_field"] == 0 + + def test_boolean_values(self, basic_schema): + """Test handling of boolean values.""" + # Data with booleans + data = {"text_field": True, "tag_field": False, "num_field": True} + + # Booleans aren't valid for text or tag + with pytest.raises(ValueError) as exc_info: + validate_object(basic_schema, data) + + assert 
"text_field" in str(exc_info.value) + + # Create new schema with only numeric + num_schema = IndexSchema( + index=Index(name="test", prefix="doc"), + fields={"num_field": Field(name="num_field", type=FieldTypes.NUMERIC)}, + ) + + # Validate with only the numeric field + result = validate_object(num_schema, {"num_field": True}) + + # Python converts True to 1, False to 0 + assert result["num_field"] == 1 + + def test_list_for_text(self, basic_schema): + """Test handling lists for text fields.""" + # Data with list for text + data = {"text_field": ["item1", "item2"]} + + # Lists aren't valid for text + with pytest.raises(ValueError) as exc_info: + validate_object(basic_schema, data) + + assert "text_field" in str(exc_info.value) + + +class TestVectorEdgeCases: + """Tests for edge cases with vector fields.""" + + @pytest.fixture + def vector_schema(self): + """Create a schema with vector fields for testing.""" + return IndexSchema( + index=Index(name="test_vectors", prefix="vec"), + fields={ + "float_vec": Field( + name="float_vec", + type=FieldTypes.VECTOR, + attrs={"dims": 3, "datatype": VectorDataType.FLOAT32}, + ), + "int_vec": Field( + name="int_vec", + type=FieldTypes.VECTOR, + attrs={"dims": 3, "datatype": VectorDataType.INT8}, + ), + }, + ) + + def test_large_vectors(self, vector_schema): + """Test validation of very large vectors.""" + # Create a large vector (1000 dimensions) + large_schema = IndexSchema( + index=Index(name="large_vec", prefix="vec"), + fields={ + "large_vec": Field( + name="large_vec", + type=FieldTypes.VECTOR, + attrs={"dims": 1000, "datatype": VectorDataType.FLOAT32}, + ) + }, + ) + + # Valid large vector + large_vector = {"large_vec": [0.1] * 1000} + result = validate_object(large_schema, large_vector) + assert len(result["large_vec"]) == 1000 + + # Invalid dimensions + invalid_dims = {"large_vec": [0.1] * 999} + with pytest.raises(ValueError) as exc_info: + validate_object(large_schema, invalid_dims) + assert "dimensions" in str(exc_info.value) + + def test_mixed_vector_types(self, vector_schema): + """Test validation of vectors with mixed element types.""" + # Float vector with mixed types + mixed_float = {"float_vec": [1, 2.5, "3"]} + with pytest.raises(ValueError) as exc_info: + validate_object(vector_schema, mixed_float) + assert "float_vec" in str(exc_info.value) + + # Int vector with mixed types + mixed_int = {"int_vec": [1, 2.5, 3]} + with pytest.raises(ValueError) as exc_info: + validate_object(vector_schema, mixed_int) + assert "int_vec" in str(exc_info.value) + + def test_empty_vector(self, vector_schema): + """Test validation of empty vectors.""" + # Empty float vector + empty_vec = {"float_vec": []} + with pytest.raises(ValueError) as exc_info: + validate_object(vector_schema, empty_vec) + assert "float_vec" in str(exc_info.value) + assert "dimensions" in str(exc_info.value) + + def test_vector_int_range(self, vector_schema): + """Test validation of integer vectors with values outside allowed range.""" + # INT8 vector with values outside range + out_of_range = {"int_vec": [100, 200, 300]} # Valid int, but outside INT8 range + with pytest.raises(ValueError) as exc_info: + validate_object(vector_schema, out_of_range) + assert "int_vec" in str(exc_info.value) + assert "must be between" in str(exc_info.value) + + # INT8 vector with valid range + valid_range = {"int_vec": [-128, 0, 127]} + result = validate_object(vector_schema, valid_range) + assert result["int_vec"] == [-128, 0, 127] + + +class TestGeoEdgeCases: + """Tests for edge cases with geo 
fields.""" + + @pytest.fixture + def geo_schema(self): + """Create a schema with geo fields for testing.""" + return IndexSchema( + index=Index(name="test_geo", prefix="geo"), + fields={"location": Field(name="location", type=FieldTypes.GEO)}, + ) + + def test_geo_boundary_values(self, geo_schema): + """Test validation of geo fields with boundary values.""" + # Valid boundary values + valid_boundaries = [ + {"location": "90,180"}, # Max lat, max lon + {"location": "-90,-180"}, # Min lat, min lon + {"location": "0,0"}, # Zero point + {"location": "90,0"}, # North pole + {"location": "-90,0"}, # South pole + ] + + for data in valid_boundaries: + result = validate_object(geo_schema, data) + assert result["location"] == data["location"] + + def test_geo_invalid_boundary_values(self, geo_schema): + """Test validation of geo fields with invalid boundary values.""" + # Invalid boundary values + invalid_boundaries = [ + {"location": "91,0"}, # Lat > 90 + {"location": "-91,0"}, # Lat < -90 + {"location": "0,181"}, # Lon > 180 + {"location": "0,-181"}, # Lon < -180 + {"location": "90.1,0"}, # Lat > 90 (decimal) + {"location": "0,180.1"}, # Lon > 180 (decimal) + ] + + for data in invalid_boundaries: + with pytest.raises(ValueError) as exc_info: + validate_object(geo_schema, data) + assert "location" in str(exc_info.value) + assert "not a valid" in str(exc_info.value) + + def test_geo_formats(self, geo_schema): + """Test validation of geo fields with different formats.""" + # Various valid formats + valid_formats = [ + {"location": "37.7749,-122.4194"}, # Decimal degrees + {"location": "-37.7749,122.4194"}, # Negative latitude + {"location": "37.7749,122.4194"}, # Positive longitude + {"location": "0.0000,0.0000"}, # Zeros with decimal + {"location": "37,-122"}, # Integer degrees + ] + + for data in valid_formats: + result = validate_object(geo_schema, data) + assert result["location"] == data["location"] + + # Invalid formats + invalid_formats = [ + {"location": "37.7749"}, # Missing longitude + {"location": "37.7749,"}, # Missing longitude value + {"location": ",122.4194"}, # Missing latitude value + {"location": "37.7749:122.4194"}, # Wrong separator + {"location": "37.7749, 122.4194"}, # Space after separator + {"location": "North,South"}, # Non-numeric values + ] + + for data in invalid_formats: + with pytest.raises(ValueError) as exc_info: + validate_object(geo_schema, data) + assert "location" in str(exc_info.value) + + +class TestNestedJsonEdgeCases: + """Tests for edge cases with nested JSON.""" + + @pytest.fixture + def nested_schema(self): + """Create a schema with JSON paths for testing.""" + fields = { + "id": Field(name="id", type=FieldTypes.TAG), + "title": Field(name="title", type=FieldTypes.TEXT, path="$.content.title"), + "rating": Field( + name="rating", type=FieldTypes.NUMERIC, path="$.metadata.rating" + ), + "deeply_nested": Field( + name="deeply_nested", + type=FieldTypes.TEXT, + path="$.level1.level2.level3.level4.value", + ), + } + + return IndexSchema( + index=Index(name="test_nested", prefix="nested"), fields=fields + ) + + def test_very_deeply_nested_json(self, nested_schema): + """Test validation with very deeply nested JSON.""" + # Create a deeply nested structure + deeply_nested = { + "id": "doc1", + "level1": { + "level2": {"level3": {"level4": {"value": "deeply nested value"}}} + }, + } + + # Validate + result = validate_object(nested_schema, deeply_nested) + assert result["id"] == "doc1" + assert result["deeply_nested"] == "deeply nested value" + + def 
test_partial_path_missing(self, nested_schema): + """Test validation when part of a JSON path is missing.""" + # Create object with partial path missing + partial_missing = { + "id": "doc1", + "level1": { + "level2": { + # level3 missing + } + }, + } + + # Validate - should ignore missing path + result = validate_object(nested_schema, partial_missing) + assert result["id"] == "doc1" + assert "deeply_nested" not in result + + def test_nested_arrays(self): + """Test validation with nested arrays in JSON.""" + # Create schema with path to array element + array_schema = IndexSchema( + index=Index(name="test_arrays", prefix="arr"), + fields={ + "id": Field(name="id", type=FieldTypes.TAG), + "first_item": Field( + name="first_item", type=FieldTypes.TEXT, path="$.items[0]" + ), + "nested_item": Field( + name="nested_item", + type=FieldTypes.TEXT, + path="$.nested.items[1].name", + ), + }, + ) + + # Note: JSONPath with array indexing is not supported currently + # This test documents this limitation + + # Create data with arrays + array_data = { + "id": "arr1", + "items": ["first", "second", "third"], + "nested": {"items": [{"name": "item1"}, {"name": "item2"}]}, + } + + # Validate - array paths won't be found + result = validate_object(array_schema, array_data) + assert result["id"] == "arr1" + assert "first_item" not in result + assert "nested_item" not in result + + +class TestValidationIntegrationEdgeCases: + """Tests for integration edge cases between storage and validation.""" + + @pytest.fixture + def storage_with_schema(self): + """Create a storage instance with schema for testing.""" + schema = IndexSchema( + index=Index(name="test_storage", prefix="doc"), + fields={ + "id": Field(name="id", type=FieldTypes.TAG), + "vec": Field( + name="vec", + type=FieldTypes.VECTOR, + attrs={"dims": 3, "datatype": VectorDataType.FLOAT32}, + ), + }, + ) + + return BaseStorage(schema=schema, client=None) + + def test_validation_with_bytes_no_client(self, storage_with_schema): + """Test validation with bytes when no Redis client is available.""" + # No Redis client was provided, so hset won't be called + # This just tests that validation works with bytes + + # Valid data with bytes + data = {"id": "doc1", "vec": b"\x00\x01\x02"} # 3 bytes + + # Validate - should work even without client + validated = storage_with_schema.validate_object(data) + assert validated["id"] == "doc1" + assert validated["vec"] == b"\x00\x01\x02" + + def test_unexpected_field_is_ignored(self, storage_with_schema): + """Test that unexpected fields are ignored during validation.""" + # Data with extra field + data = { + "id": "doc1", + "vec": [0.1, 0.2, 0.3], + "extra": "This field is not in the schema", + } + + # Validate + validated = storage_with_schema.validate_object(data) + + # Extra field should be ignored + assert validated["id"] == "doc1" + assert validated["vec"] == [0.1, 0.2, 0.3] + assert "extra" not in validated diff --git a/tests/unit/test_fields.py b/tests/unit/test_fields.py index f420afff..3376a67c 100644 --- a/tests/unit/test_fields.py +++ b/tests/unit/test_fields.py @@ -1,3 +1,5 @@ +from typing import Any, Optional, Tuple + import pytest from redis.commands.search.field import GeoField as RedisGeoField from redis.commands.search.field import NumericField as RedisNumericField @@ -217,3 +219,58 @@ def test_create_unknown_field_type(): with pytest.raises(ValueError) as excinfo: FieldFactory.create_field("unknown", "example_field") assert "Unknown field type: unknown" in str(excinfo.value) + + +# Add validation tests 
for each field type +@pytest.mark.parametrize( + "field_class,valid_value,invalid_value,error_msg", + [ + (TextField, "sample text", 123, "expects a string"), + (NumericField, 123.45, "123.45", "looks like a number"), + (TagField, ["tag1", "tag2"], ["tag1", 123], "must be a string"), + (GeoField, "37.7749,-122.4194", "invalid-geo", "not a valid 'lat,lon' format"), + # Add vector field test cases + ], +) +def test_field_validation(field_class, valid_value, invalid_value, error_msg): + """Test validation logic for each field type""" + # Create field instance + field = field_class(name="test_field") + + # Test valid value + is_valid, error = field.validate(valid_value) + assert is_valid, f"Field should accept valid value: {valid_value}" + assert error is None, "No error message should be returned for valid value" + + # Test invalid value + is_valid, error = field.validate(invalid_value) + assert not is_valid, f"Field should reject invalid value: {invalid_value}" + assert ( + error_msg in error + ), f"Error message should contain '{error_msg}', got: {error}" + + +def test_vector_field_validation(): + """Test validation for vector fields specifically""" + # Create vector fields with specific dimensions + flat_field = create_flat_vector_field(dims=3) + hnsw_field = create_hnsw_vector_field(dims=3) + + # Valid vector + valid_vector = [0.1, 0.2, 0.3] + + # Test valid cases + assert flat_field.validate(valid_vector)[0], "Should accept valid vector" + assert hnsw_field.validate(valid_vector)[0], "Should accept valid vector" + + # Test wrong dimensions + wrong_dims = [0.1, 0.2] # Only 2 dimensions + is_valid, error = flat_field.validate(wrong_dims) + assert not is_valid, "Should reject vector with wrong dimensions" + assert "expects 3 dimensions" in error + + # Test wrong type + wrong_type = ["a", "b", "c"] # Strings instead of numbers + is_valid, error = hnsw_field.validate(wrong_type) + assert not is_valid, "Should reject vector with non-numeric elements" + assert "must be a number" in error diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py index 21637dd5..539a1b76 100644 --- a/tests/unit/test_storage.py +++ b/tests/unit/test_storage.py @@ -1,32 +1,105 @@ +""" +Tests for RedisVL storage classes with focus on validation integration. + +This module tests how the storage classes integrate with the validation system: +1. How validation is used in storage operations +2. Preprocessing and validation flow +3. 
Error handling in write operations +""" + +from typing import Any, Dict +from unittest.mock import MagicMock, Mock, patch + import pytest -from redisvl.index.storage import BaseStorage, HashStorage, JsonStorage +from redisvl.index.storage import HashStorage, JsonStorage +from redisvl.schema import IndexInfo, IndexSchema +from redisvl.schema.fields import ( + FlatVectorField, + FlatVectorFieldAttributes, + GeoField, + HNSWVectorField, + HNSWVectorFieldAttributes, + NumericField, + TagField, + TextField, + VectorDataType, + VectorDistanceMetric, +) +from redisvl.schema.validation import validate_object + + +@pytest.fixture +def sample_schema(): + """Create a comprehensive schema for testing with all field types""" + return IndexSchema.from_dict( + { + "index": { + "name": "test-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "hash", + }, + "fields": [ + # Standard fields + {"type": "text", "name": "text_field"}, + {"type": "numeric", "name": "num_field"}, + {"type": "tag", "name": "tag_field"}, + {"type": "geo", "name": "geo_field"}, + # Vector fields + { + "type": "vector", + "name": "flat_vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "distance_metric": "cosine", + "data_type": "float32", + }, + }, + { + "type": "vector", + "name": "hnsw_vector", + "attrs": { + "algorithm": "hnsw", + "dims": 3, + "distance_metric": "cosine", + "data_type": "float32", + "m": 16, + "ef_construction": 200, + "ef_runtime": 10, + "epsilon": 0.01, + }, + }, + ], + } + ) @pytest.fixture(params=[JsonStorage, HashStorage]) -def storage_instance(request): +def storage_instance(request, sample_schema): StorageClass = request.param - instance = StorageClass(prefix="test", key_separator=":") + instance = StorageClass(index_schema=sample_schema) return instance def test_key_formatting(storage_instance): key = "1234" generated_key = storage_instance._key(key, "", "") - assert generated_key == key, "The generated key does not match the expected format." + assert generated_key == key generated_key = storage_instance._key(key, "", ":") - assert generated_key == key, "The generated key does not match the expected format." + assert generated_key == key generated_key = storage_instance._key(key, "test", ":") - assert ( - generated_key == f"test:{key}" - ), "The generated key does not match the expected format." + assert generated_key == f"test:{key}" def test_create_key(storage_instance): id_field = "id" obj = {id_field: "1234"} expected_key = ( - f"{storage_instance.prefix}{storage_instance.key_separator}{obj[id_field]}" + f"{storage_instance.index_schema.index.prefix}" + f"{storage_instance.index_schema.index.key_separator}" + f"{obj[id_field]}" ) generated_key = storage_instance._create_key(obj, id_field) assert ( @@ -34,47 +107,456 @@ def test_create_key(storage_instance): ), "The generated key does not match the expected format." 
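+
+
+def test_preprocess_rejects_non_dict(storage_instance):
+    """Illustrative sketch: non-dict inputs are rejected during preprocessing
+    with a ValueError (replacing the TypeError from the removed _validate).
+    """
+    with pytest.raises(ValueError):
+        storage_instance._preprocess_and_validate_objects(["not a dict"])
+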
-def test_validate_success(storage_instance): - data = {"foo": "bar"} - try: - storage_instance._validate(data) - except Exception as e: - pytest.fail(f"_validate should not raise an exception here, but raised {e}") - - -def test_validate_failure(storage_instance): - data = "Some invalid data type" - with pytest.raises(TypeError): - storage_instance._validate(data) - data = 12345 - with pytest.raises(TypeError): - storage_instance._validate(data) - - def test_preprocess(storage_instance): data = {"key": "value"} - preprocessed_data = storage_instance._preprocess(preprocess=None, obj=data) + preprocessed_data = storage_instance._preprocess(data, preprocess=None) assert preprocessed_data == data def fn(d): d["foo"] = "bar" return d - preprocessed_data = storage_instance._preprocess(fn, data) + preprocessed_data = storage_instance._preprocess(data, fn) assert "foo" in preprocessed_data assert preprocessed_data["foo"] == "bar" -@pytest.mark.asyncio -async def test_preprocess(storage_instance): - data = {"key": "value"} - preprocessed_data = await storage_instance._apreprocess(preprocess=None, obj=data) - assert preprocessed_data == data +def test_preprocess_and_validate_objects(storage_instance): + """Test combined preprocessing and validation""" + objects = [ + {"num_field": 123, "text_field": "valid text"}, # Valid + {"num_field": "123", "text_field": "valid text"}, # Invalid numeric field + ] - async def fn(d): - d["foo"] = "bar" - return d + def preprocess(obj): + obj["processed"] = True + return obj - preprocessed_data = await storage_instance._apreprocess(data, fn) - assert "foo" in preprocessed_data - assert preprocessed_data["foo"] == "bar" + # When validate=True, should raise ValueError for invalid object + with pytest.raises(ValueError) as exc_info: + storage_instance._preprocess_and_validate_objects( + objects, preprocess=preprocess, validate=True + ) + + # Error message should mention the issue + assert "Validation failed" in str(exc_info.value) + assert "must be a number" in str(exc_info.value) + + # When validate=False, should process both objects without errors + prepared_objects = storage_instance._preprocess_and_validate_objects( + objects, preprocess=preprocess, validate=False + ) + + assert len(prepared_objects) == 2 + # Preprocessing should have worked for both objects + assert all(obj[1].get("processed") for obj in prepared_objects) + + +def test_validate_object(storage_instance): + """Test validation of individual objects""" + + # Valid data should be returned unchanged (except for any type coercion) + valid_data = { + "text_field": "some text", + "num_field": 123.45, + "tag_field": "tag1,tag2,tag3", + "geo_field": "37.7749,-122.4194", + "flat_vector": [0.1, 0.2, 0.3], + "hnsw_vector": [0.4, 0.5, 0.6], + } + + validated = storage_instance.validate(valid_data) + assert validated is not None + assert validated["num_field"] == valid_data["num_field"] + assert validated["text_field"] == valid_data["text_field"] + + # Invalid text field + invalid_text = valid_data.copy() + invalid_text["text_field"] = 123 + with pytest.raises(ValueError) as exc_info: + storage_instance.validate(invalid_text) + assert "text_field" in str(exc_info.value) + + # Invalid numeric field (string that looks like number) + invalid_numeric = valid_data.copy() + invalid_numeric["num_field"] = "123.45" + with pytest.raises(ValueError) as exc_info: + storage_instance.validate(invalid_numeric) + assert "num_field" in str(exc_info.value) + + # Invalid geo field + invalid_geo = valid_data.copy() + 
invalid_geo["geo_field"] = "invalid-geo-format" + with pytest.raises(ValueError) as exc_info: + storage_instance.validate(invalid_geo) + assert "geo_field" in str(exc_info.value) + + # Invalid vector field (wrong dimensions) + invalid_vector_dims = valid_data.copy() + invalid_vector_dims["flat_vector"] = [0.1, 0.2] + with pytest.raises(ValueError) as exc_info: + storage_instance.validate(invalid_vector_dims) + assert "flat_vector" in str(exc_info.value) + assert "dimensions" in str(exc_info.value) + + # Invalid vector field (non-numeric values) + invalid_vector_values = valid_data.copy() + invalid_vector_values["hnsw_vector"] = ["a", "b", "c"] + with pytest.raises(ValueError) as exc_info: + storage_instance.validate(invalid_vector_values) + assert "hnsw_vector" in str(exc_info.value) + assert "numeric values" in str(exc_info.value) + + +def test_partial_object_validation(storage_instance): + """Test validation of partial objects (missing fields)""" + + # Object with only some fields + partial_data = { + "text_field": "valid text", + # Missing num_field, tag_field, etc. + } + + # Should validate successfully since fields are optional + validated = storage_instance.validate(partial_data) + assert validated is not None + assert "text_field" in validated + assert "num_field" not in validated + + # Explicitly setting a field to None should result in it being excluded + null_field_data = {"text_field": "valid text", "num_field": None} + + validated = storage_instance.validate(null_field_data) + assert "num_field" not in validated + + +def test_write_with_validation(storage_instance, mocker): + """Test the write method with validation enabled""" + # Mock the _set method to avoid actual Redis calls + mocker.patch.object(storage_instance, "_set") + + # Mock pipeline execution + mock_pipe = mocker.MagicMock() + mock_pipe.execute = mocker.MagicMock() + + # Mock Redis client + mock_client = mocker.MagicMock() + mock_client.pipeline.return_value.__enter__.return_value = mock_pipe + + # Valid and invalid objects + objects = [ + {"text_field": "valid", "num_field": 123}, # Valid + {"text_field": 456, "num_field": 789}, # Invalid text field + ] + + # With validation enabled, should raise error on first invalid object + with pytest.raises(ValueError) as exc_info: + storage_instance.write(mock_client, objects, validate=True) + + assert "Validation failed" in str(exc_info.value) + assert "text_field" in str(exc_info.value) + + # With validation disabled, should process all objects + keys = storage_instance.write(mock_client, objects, validate=False) + + assert len(keys) == 2 + assert storage_instance._set.call_count == 2 + + +class TestBaseStorageValidation: + """Tests for validation in BaseStorage class.""" + + def test_validate_object(self, comprehensive_schema, valid_data): + """Test the validate_object method.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Validate object + validated = storage.validate_object(valid_data) + + # Verify object was validated + assert validated is not None + assert "id" in validated + assert "title" in validated + + def test_validate_object_with_invalid_data(self, comprehensive_schema, valid_data): + """Test validation with invalid data.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Create invalid data + invalid_data = valid_data.copy() + invalid_data["rating"] = "not a number" + + # Validation should fail + with pytest.raises(ValueError) as exc_info: + storage.validate_object(invalid_data) + + # Error 
message should mention validation failure + assert "Validation failed" in str(exc_info.value) + + def test_preprocess_and_validate_objects_success( + self, comprehensive_schema, valid_data + ): + """Test _preprocess_and_validate_objects with valid data.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Process objects + objects = [valid_data] + validated_objects = storage._preprocess_and_validate_objects(objects) + + # Verify objects were validated + assert len(validated_objects) == 1 + assert "id" in validated_objects[0] + assert "title" in validated_objects[0] + + def test_preprocess_and_validate_objects_fail( + self, comprehensive_schema, valid_data + ): + """Test _preprocess_and_validate_objects with invalid data.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Create mix of valid and invalid data + invalid_data = valid_data.copy() + invalid_data["rating"] = "not a number" + + # Process should fail fast on first invalid object + with pytest.raises(ValueError) as exc_info: + storage._preprocess_and_validate_objects([invalid_data, valid_data]) + + # Error message should mention validation failure + assert "Validation failed" in str(exc_info.value) + + def test_write_one_validation(self, comprehensive_schema, valid_data): + """Test that write_one validates objects.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = BaseStorage(schema=comprehensive_schema, client=client_mock) + + # Mock hset to avoid actual Redis call + client_mock.hset = Mock() + + # Call write_one + storage.write_one(valid_data) + + # Verify hset was called + client_mock.hset.assert_called_once() + + def test_write_one_validation_fail(self, comprehensive_schema, valid_data): + """Test that write_one fails on invalid data.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = BaseStorage(schema=comprehensive_schema, client=client_mock) + + # Create invalid data + invalid_data = valid_data.copy() + invalid_data["rating"] = "not a number" + + # Call write_one with invalid data + with pytest.raises(ValueError) as exc_info: + storage.write_one(invalid_data) + + # Verify error and that hset was not called + assert "Validation failed" in str(exc_info.value) + client_mock.hset.assert_not_called() + + def test_write_many_validation(self, comprehensive_schema, valid_data): + """Test that write_many validates all objects.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = BaseStorage(schema=comprehensive_schema, client=client_mock) + + # Mock pipeline to avoid actual Redis call + pipeline_mock = Mock() + client_mock.pipeline.return_value.__enter__.return_value = pipeline_mock + + # Call write_many with multiple valid objects + storage.write_many([valid_data, valid_data.copy()]) + + # Verify pipeline executed + pipeline_mock.execute.assert_called_once() + + def test_write_many_validation_fail(self, comprehensive_schema, valid_data): + """Test that write_many fails on invalid data.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = BaseStorage(schema=comprehensive_schema, client=client_mock) + + # Mock pipeline to avoid actual Redis call + pipeline_mock = Mock() + client_mock.pipeline.return_value.__enter__.return_value = pipeline_mock + + # Create invalid data + invalid_data = valid_data.copy() + invalid_data["rating"] = "not a number" + + # Call write_many with invalid data + with pytest.raises(ValueError) as exc_info: + 
storage.write_many([valid_data, invalid_data]) + + # Verify error and that execute was not called + assert "Validation failed" in str(exc_info.value) + pipeline_mock.execute.assert_not_called() + + +class TestJsonStorageValidation: + """Tests for validation in JsonStorage class.""" + + def test_validate_json_document(self, json_schema, valid_nested_data): + """Test validating a JSON document.""" + # Create JSON storage + storage = JsonStorage(schema=json_schema) + + # Validate object + validated = storage.validate_object(valid_nested_data) + + # Verify object was validated and flattened + assert validated is not None + assert "id" in validated + assert "user" in validated + assert "title" in validated + assert "rating" in validated + + def test_validate_json_missing_paths(self, json_schema): + """Test validating JSON with missing paths.""" + # Create JSON storage + storage = JsonStorage(schema=json_schema) + + # Create object with missing paths + partial_nested = { + "id": "doc1", + "metadata": { + "user": "user123" + # missing rating + }, + "content": { + "title": "Test Document" + # missing embedding + }, + } + + # Validate object + validated = storage.validate_object(partial_nested) + + # Verify validation succeeds with missing fields + assert validated is not None + assert "id" in validated + assert "user" in validated + assert "title" in validated + + # Missing fields should be absent + assert "rating" not in validated + assert "embedding" not in validated + + def test_validate_json_invalid_path(self, json_schema, valid_nested_data): + """Test validating JSON with invalid path values.""" + # Create JSON storage + storage = JsonStorage(schema=json_schema) + + # Create object with invalid data + invalid_nested = valid_nested_data.copy() + invalid_nested["metadata"]["rating"] = "not a number" + + # Validation should fail + with pytest.raises(ValueError) as exc_info: + storage.validate_object(invalid_nested) + + # Error message should mention validation failure + assert "Validation failed" in str(exc_info.value) + assert "rating" in str(exc_info.value) + + def test_write_json_document(self, json_schema, valid_nested_data): + """Test writing a JSON document.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = JsonStorage(schema=json_schema, client=client_mock) + + # Mock json.set to avoid actual Redis call + client_mock.json.set = Mock() + + # Call write_one + storage.write_one(valid_nested_data) + + # Verify json.set was called + client_mock.json.set.assert_called_once() + + def test_write_json_validation_fail(self, json_schema, valid_nested_data): + """Test that write fails on invalid JSON.""" + # Create storage with mocked redis client + client_mock = Mock() + storage = JsonStorage(schema=json_schema, client=client_mock) + + # Create invalid data + invalid_nested = valid_nested_data.copy() + invalid_nested["metadata"]["rating"] = "not a number" + + # Call write_one with invalid data + with pytest.raises(ValueError) as exc_info: + storage.write_one(invalid_nested) + + # Verify error and that json.set was not called + assert "Validation failed" in str(exc_info.value) + client_mock.json.set.assert_not_called() + + +@patch("redisvl.schema.validation.validate_object") +class TestValidationIntegration: + """Tests for integration between storage and validation.""" + + def test_validate_object_is_called( + self, mock_validate, comprehensive_schema, valid_data + ): + """Test that validate_object is called from BaseStorage.""" + # Create storage + storage = 
BaseStorage(schema=comprehensive_schema) + + # Set up mock to return the input data + mock_validate.return_value = valid_data + + # Call validate_object + storage.validate_object(valid_data) + + # Verify mock was called with correct args + mock_validate.assert_called_once_with(comprehensive_schema, valid_data) + + def test_preprocess_calls_validate_for_each_object( + self, mock_validate, comprehensive_schema, valid_data + ): + """Test that _preprocess_and_validate_objects calls validate for each object.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Set up mock to return the input data + mock_validate.return_value = valid_data + + # Call _preprocess_and_validate_objects with multiple objects + objects = [valid_data, valid_data.copy(), valid_data.copy()] + storage._preprocess_and_validate_objects(objects) + + # Verify mock was called for each object + assert mock_validate.call_count == len(objects) + + def test_preprocess_stops_on_first_validation_error( + self, mock_validate, comprehensive_schema, valid_data + ): + """Test that processing stops on first validation error.""" + # Create storage + storage = BaseStorage(schema=comprehensive_schema) + + # Set up mock to raise error on second call + mock_validate.side_effect = [ + valid_data, + ValueError("Validation failed for 2nd object"), + valid_data, + ] + + # Call _preprocess_and_validate_objects + objects = [valid_data, valid_data.copy(), valid_data.copy()] + with pytest.raises(ValueError) as exc_info: + storage._preprocess_and_validate_objects(objects) + + # Verify error and that mock was called twice + assert "Validation failed for 2nd object" in str(exc_info.value) + assert mock_validate.call_count == 2 diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py new file mode 100644 index 00000000..6431303c --- /dev/null +++ b/tests/unit/test_validation.py @@ -0,0 +1,515 @@ +""" +Tests for the RedisVL schema validation module. + +This module tests the core validation functionality: +1. Model generation from schemas +2. Field-specific validators +3. JSON path extraction +4. 
Validation of various field types +""" + +import re +from typing import Any, Dict, List + +import pytest + +from redisvl.schema import IndexSchema +from redisvl.schema.fields import FieldTypes, VectorDataType +from redisvl.schema.type_utils import TypeInferrer +from redisvl.schema.validation import ( + SchemaModelGenerator, + extract_from_json_path, + validate_object, +) + + +@pytest.fixture +def sample_schema(): + """Create a sample schema with different field types for testing.""" + schema_dict = { + "index": { + "name": "test-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "hash", + }, + "fields": [ + {"name": "id", "type": "tag"}, + {"name": "title", "type": "text"}, + {"name": "rating", "type": "numeric"}, + {"name": "location", "type": "geo"}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", + }, + }, + ], + } + return IndexSchema.from_dict(schema_dict) + + +@pytest.fixture +def sample_json_schema(): + """Create a sample schema with JSON storage and path fields.""" + schema_dict = { + "index": { + "name": "test-json-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + {"name": "id", "type": "tag", "path": "$.id"}, + {"name": "user", "type": "tag", "path": "$.metadata.user"}, + {"name": "title", "type": "text", "path": "$.content.title"}, + {"name": "rating", "type": "numeric", "path": "$.metadata.rating"}, + { + "name": "embedding", + "type": "vector", + "path": "$.content.embedding", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", + }, + }, + ], + } + return IndexSchema.from_dict(schema_dict) + + +@pytest.fixture +def valid_data(): + """Sample valid data for testing validation.""" + return { + "id": "doc1", + "title": "Test Document", + "rating": 4.5, + "location": "37.7749,-122.4194", + "embedding": [0.1, 0.2, 0.3, 0.4], + } + + +@pytest.fixture +def valid_nested_data(): + """Sample valid nested data for testing JSON path validation.""" + return { + "id": "doc1", + "metadata": {"user": "user123", "rating": 4.5}, + "content": {"title": "Test Document", "embedding": [0.1, 0.2, 0.3, 0.4]}, + } + + +class TestSchemaModelGenerator: + """Tests for the SchemaModelGenerator class.""" + + def test_get_model_for_schema(self, sample_schema): + """Test generating a model from a schema.""" + # Get model for schema + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Verify model name matches the index name + assert model_class.__name__ == "test-index__PydanticModel" + + # Verify model has expected fields + for field_name in sample_schema.field_names: + assert field_name in model_class.model_fields + + def test_model_caching(self, sample_schema): + """Test that models are cached and reused.""" + # Get model twice + model1 = SchemaModelGenerator.get_model_for_schema(sample_schema) + model2 = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Verify same instance + assert model1 is model2 + + def test_type_mapping(self, sample_schema): + """Test mapping Redis field types to Pydantic types.""" + for field_name, field in sample_schema.fields.items(): + field_type = SchemaModelGenerator._map_field_to_pydantic_type(field) + + # Verify each field type maps to expected Python type + if field.type == FieldTypes.TEXT: + assert field_type == str + elif field.type == FieldTypes.TAG: + assert field_type == str + elif field.type == 
FieldTypes.NUMERIC: + assert field_type.__origin__ == type(Union) # Check it's a Union + elif field.type == FieldTypes.VECTOR: + assert field_type.__origin__ == type(Union) # Check it's a Union + + def test_unsupported_field_type(self): + """Test that an error is raised for unsupported field types.""" + + # Create a dummy field with unsupported type + class DummyField: + type = "unsupported_type" + + # Mapping should raise ValueError + with pytest.raises(ValueError) as exc_info: + SchemaModelGenerator._map_field_to_pydantic_type(DummyField()) + + assert "Unsupported field type" in str(exc_info.value) + + +class TestFieldValidators: + """Tests for field-specific validators.""" + + def test_text_field_validation(self, sample_schema, valid_data): + """Test validation of text fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid text field + valid = valid_data.copy() + validated = model_class.model_validate(valid) + assert validated.title == "Test Document" + + # Invalid text field (number) + invalid = valid_data.copy() + invalid["title"] = 123 + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid) + assert "title" in str(exc_info.value) + assert "must be a string" in str(exc_info.value) + + def test_tag_field_validation(self, sample_schema, valid_data): + """Test validation of tag fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid tag field + valid = valid_data.copy() + validated = model_class.model_validate(valid) + assert validated.id == "doc1" + + # Invalid tag field (number) + invalid = valid_data.copy() + invalid["id"] = 123 + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid) + assert "id" in str(exc_info.value) + assert "must be a string" in str(exc_info.value) + + def test_numeric_field_validation(self, sample_schema, valid_data): + """Test validation of numeric fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid numeric field (integer) + valid_int = valid_data.copy() + valid_int["rating"] = 5 + validated = model_class.model_validate(valid_int) + assert validated.rating == 5 + + # Valid numeric field (float) + valid_float = valid_data.copy() + valid_float["rating"] = 4.5 + validated = model_class.model_validate(valid_float) + assert validated.rating == 4.5 + + # Invalid numeric field (string) + invalid = valid_data.copy() + invalid["rating"] = "high" + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid) + assert "rating" in str(exc_info.value) + assert "must be a number" in str(exc_info.value) + + # Invalid numeric field (string that looks like number) + invalid_num_str = valid_data.copy() + invalid_num_str["rating"] = "4.5" + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_num_str) + assert "rating" in str(exc_info.value) + assert "must be a number" in str(exc_info.value) + + def test_geo_field_validation(self, sample_schema, valid_data): + """Test validation of geo fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid geo format + valid_geo = valid_data.copy() + valid_geo["location"] = "37.7749,-122.4194" + validated = model_class.model_validate(valid_geo) + assert validated.location == "37.7749,-122.4194" + + # Invalid geo format (not matching lat,lon pattern) + invalid_geo = valid_data.copy() + invalid_geo["location"] = "invalid_geo" + with pytest.raises(ValueError) as exc_info: + 
model_class.model_validate(invalid_geo) + assert "location" in str(exc_info.value) + assert "not a valid 'lat,lon' format" in str(exc_info.value) + + # Verify the geo pattern actually works with valid formats + valid_formats = [ + "0,0", + "90,-180", + "-90,180", + "37.7749,-122.4194", + "37.7749,122.4194", + "-37.7749,-122.4194", + ] + for format in valid_formats: + assert re.match(TypeInferrer.GEO_PATTERN.pattern, format) + + # Verify invalid formats fail the pattern + invalid_formats = [ + "invalid", + "37.7749", + "37.7749,", + ",122.4194", + "91,0", # Latitude > 90 + "-91,0", # Latitude < -90 + "0,181", # Longitude > 180 + "0,-181", # Longitude < -180 + ] + for format in invalid_formats: + assert not re.match(TypeInferrer.GEO_PATTERN.pattern, format) + + def test_vector_field_validation_float(self, sample_schema, valid_data): + """Test validation of float vector fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid vector + valid_vector = valid_data.copy() + valid_vector["embedding"] = [0.1, 0.2, 0.3, 0.4] + validated = model_class.model_validate(valid_vector) + assert validated.embedding == [0.1, 0.2, 0.3, 0.4] + + # Valid vector as bytes + valid_bytes = valid_data.copy() + valid_bytes["embedding"] = b"\x00\x01\x02\x03" + validated = model_class.model_validate(valid_bytes) + assert validated.embedding == b"\x00\x01\x02\x03" + + # Invalid vector type (string) + invalid_type = valid_data.copy() + invalid_type["embedding"] = "not a vector" + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_type) + assert "embedding" in str(exc_info.value) + + # Invalid dimensions + invalid_dims = valid_data.copy() + invalid_dims["embedding"] = [0.1, 0.2, 0.3] # 3 dimensions instead of 4 + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_dims) + assert "embedding" in str(exc_info.value) + assert "dimensions" in str(exc_info.value) + + # Invalid vector values + invalid_values = valid_data.copy() + invalid_values["embedding"] = [0.1, "string", 0.3, 0.4] + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_values) + assert "embedding" in str(exc_info.value) + + def test_vector_field_validation_int(self, sample_schema, valid_data): + """Test validation of integer vector fields.""" + model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + + # Valid integer vector + valid_vector = valid_data.copy() + valid_vector["int_vector"] = [1, 2, 3] + validated = model_class.model_validate(valid_vector) + assert validated.int_vector == [1, 2, 3] + + # Invalid: float values in int vector + invalid_floats = valid_data.copy() + invalid_floats["int_vector"] = [0.1, 0.2, 0.3] + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_floats) + assert "int_vector" in str(exc_info.value) + assert "integer values" in str(exc_info.value) + + # Invalid: values outside INT8 range + invalid_range = valid_data.copy() + invalid_range["int_vector"] = [1000, 2000, 3000] # Outside INT8 range + with pytest.raises(ValueError) as exc_info: + model_class.model_validate(invalid_range) + assert "int_vector" in str(exc_info.value) + assert "must be between" in str(exc_info.value) + + +class TestJsonPathValidation: + """Tests for JSON path-based validation.""" + + def test_extract_from_json_path(self, valid_nested_data): + """Test extracting values using JSON paths.""" + # Test simple path + assert extract_from_json_path(valid_nested_data, "$.id") == "doc1" + + # Test 
nested path + assert extract_from_json_path(valid_nested_data, "$.metadata.user") == "user123" + assert extract_from_json_path(valid_nested_data, "$.metadata.rating") == 4.5 + assert ( + extract_from_json_path(valid_nested_data, "$.content.title") + == "Test Document" + ) + assert extract_from_json_path(valid_nested_data, "$.content.embedding") == [ + 0.1, + 0.2, + 0.3, + 0.4, + ] + + # Test non-existent path + assert extract_from_json_path(valid_nested_data, "$.nonexistent") is None + assert ( + extract_from_json_path(valid_nested_data, "$.metadata.nonexistent") is None + ) + + # Test path with alternate formats + assert extract_from_json_path(valid_nested_data, "metadata.user") == "user123" + + def test_validate_nested_json(self, sample_json_schema, valid_nested_data): + """Test validating a nested JSON object.""" + # Validate nested object + validated = validate_object(sample_json_schema, valid_nested_data) + + # Verify validation succeeds and flattens the structure + assert validated is not None + assert "id" in validated + assert "user" in validated + assert "title" in validated + assert "rating" in validated + assert "embedding" in validated + + # Verify values were extracted correctly + assert validated["id"] == "doc1" + assert validated["user"] == "user123" + assert validated["title"] == "Test Document" + assert validated["rating"] == 4.5 + assert validated["embedding"] == [0.1, 0.2, 0.3, 0.4] + + def test_validate_nested_json_missing_paths(self, sample_json_schema): + """Test validating a nested JSON with missing paths.""" + # Nested object with missing paths + partial_nested = { + "id": "doc1", + "metadata": { + "user": "user123" + # missing rating + }, + "content": { + "title": "Test Document" + # missing embedding + }, + } + + # Validate object + validated = validate_object(sample_json_schema, partial_nested) + + # Verify validation succeeds with partial data + assert validated is not None + assert "id" in validated + assert "user" in validated + assert "title" in validated + assert "rating" not in validated + assert "embedding" not in validated + + +class TestObjectValidation: + """Tests for complete object validation.""" + + def test_validate_valid_object(self, sample_schema, valid_data): + """Test validating a valid object.""" + # Validate object + validated = validate_object(sample_schema, valid_data) + + # Verify no exceptions and data is returned + assert validated is not None + + # Verify all fields are present + for field_name in sample_schema.field_names: + if field_name in valid_data: + assert field_name in validated + + def test_validate_missing_optional_fields(self, sample_schema): + """Test validating an object with missing optional fields.""" + # Object with only some fields + partial_data = {"id": "doc1", "title": "Test Document"} + + # Validate object + validated = validate_object(sample_schema, partial_data) + + # Verify validation passes with partial data + assert validated is not None + assert "id" in validated + assert "title" in validated + assert "rating" not in validated + assert "location" not in validated + assert "embedding" not in validated + + def test_explicit_none_fields_are_excluded(self, sample_schema): + """Test that fields explicitly set to None are excluded from output.""" + # Object with some fields set to None + data_with_none = { + "id": "doc1", + "title": "Test Document", + "rating": None, + "location": None, + } + + # Validate object + validated = validate_object(sample_schema, data_with_none) + + # Verify None fields are excluded + 
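
To make the `validate_object` behavior these tests pin down concrete, here is a minimal sketch. It is illustrative only: `sample_schema` and `sample_json_schema` refer to the fixtures defined earlier in this file, and the expected outputs are inferred from the assertions above rather than from running the code.

```python
from redisvl.schema.validation import validate_object

# JSON schemas: values are pulled from their JSON paths and flattened
nested = {
    "id": "doc1",
    "metadata": {"user": "user123", "rating": 4.5},
    "content": {"title": "Test Document", "embedding": [0.1, 0.2, 0.3, 0.4]},
}
flat = validate_object(sample_json_schema, nested)
# -> {"id": "doc1", "user": "user123", "title": "Test Document",
#     "rating": 4.5, "embedding": [0.1, 0.2, 0.3, 0.4]}

# Fields explicitly set to None are dropped from the validated output
validate_object(sample_schema, {"id": "doc1", "rating": None})
# -> {"id": "doc1"}
```
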
assert validated is not None + assert "id" in validated + assert "title" in validated + assert "rating" not in validated + assert "location" not in validated + + def test_validate_with_multiple_invalid_fields(self, sample_schema, valid_data): + """Test validation with multiple invalid fields.""" + # Create object with multiple invalid fields + invalid_data = valid_data.copy() + invalid_data["title"] = 123 + invalid_data["rating"] = "not a number" + invalid_data["location"] = "invalid" + + # Validation should fail with the first error encountered + with pytest.raises(ValueError) as exc_info: + validate_object(sample_schema, invalid_data) + + # Error message should mention validation failure + assert "Validation failed" in str(exc_info.value) + + @pytest.mark.parametrize( + "case", + [ + {"field": "title", "value": 123, "error_text": "must be a string"}, + {"field": "rating", "value": "high", "error_text": "must be a number"}, + { + "field": "location", + "value": "invalid_geo", + "error_text": "not a valid 'lat,lon' format", + }, + { + "field": "embedding", + "value": [0.1, 0.2, 0.3], + "error_text": "dimensions", + }, + ], + ) + def test_validate_invalid_field_parametrized(self, sample_schema, valid_data, case): + """Parametrized test for validating invalid fields.""" + # Create invalid data according to test case + invalid_data = valid_data.copy() + invalid_data[case["field"]] = case["value"] + + # Validate and check error + with pytest.raises(ValueError) as exc_info: + validate_object(sample_schema, invalid_data) + + # Error should mention the field and specific issue + error_message = str(exc_info.value) + assert case["field"] in error_message + assert case["error_text"] in error_message From fa8041a662cad82d5ad8b16860d2c0f3bcc96d88 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Tue, 25 Mar 2025 22:16:58 -0400 Subject: [PATCH 02/11] dynamic pydantic model validation on load --- redisvl/exceptions.py | 34 +- redisvl/index/index.py | 52 +- redisvl/index/storage.py | 22 +- redisvl/schema/validation.py | 216 ++++---- tests/unit/conftest.py | 183 ------- tests/unit/test_storage.py | 647 +++++------------------- tests/unit/test_validation.py | 901 ++++++++++++++++++++-------------- 7 files changed, 824 insertions(+), 1231 deletions(-) delete mode 100644 tests/unit/conftest.py diff --git a/redisvl/exceptions.py b/redisvl/exceptions.py index e645e3e2..f8917c3d 100644 --- a/redisvl/exceptions.py +++ b/redisvl/exceptions.py @@ -1,10 +1,32 @@ -class RedisVLException(Exception): - """Base RedisVL exception""" +""" +RedisVL Exception Classes +This module defines all custom exceptions used throughout the RedisVL library. 
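
As a quick illustration of the hierarchy this module introduces (a sketch based only on the class definitions that follow, not additional API): all errors share `RedisVLError` as a base, and `SchemaValidationError.__init__` folds the failing object's position into the message.

```python
from redisvl.exceptions import RedisVLError, SchemaValidationError

try:
    raise SchemaValidationError("title must be a string", index=1)
except RedisVLError as e:
    # A broad handler still catches the subclass; __init__ has already
    # prefixed the message with the object position:
    print(e)  # Validation failed for object at index 1: title must be a string
```
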
+""" -class RedisModuleVersionError(RedisVLException): - """Invalid module versions installed""" +class RedisVLError(Exception): + """Base exception for all RedisVL errors.""" -class RedisSearchError(RedisVLException): - """Error while performing a search or aggregate request""" + pass + + +class RedisModuleVersionError(RedisVLError): + """Error raised when required Redis modules are missing or have incompatible versions.""" + + pass + + +class RedisSearchError(RedisVLError): + """Error raised for Redis Search specific operations.""" + + pass + + +class SchemaValidationError(RedisVLError): + """Error when validating data against a schema.""" + + def __init__(self, message, index=None): + if index is not None: + message = f"Validation failed for object at index {index}: {message}" + super().__init__(message) diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 0cf9b172..806b4ba5 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -32,7 +32,12 @@ from redis.commands.helpers import get_protocol_version # type: ignore from redis.commands.search.indexDefinition import IndexDefinition -from redisvl.exceptions import RedisModuleVersionError, RedisSearchError +from redisvl.exceptions import ( + RedisModuleVersionError, + RedisSearchError, + RedisVLError, + SchemaValidationError, +) from redisvl.index.storage import BaseStorage, HashStorage, JsonStorage from redisvl.query import BaseQuery, CountQuery, FilterQuery from redisvl.query.filter import FilterExpression @@ -594,27 +599,8 @@ def load( List[str]: List of keys loaded to Redis. Raises: - ValueError: If the length of provided keys does not match the length - of objects or if validation fails when validate_on_load is enabled. - - .. code-block:: python - - data = [{"test": "foo"}, {"test": "bar"}] - - # simple case - keys = index.load(data) - - # set 360 second ttl policy on data - keys = index.load(data, ttl=360) - - # load data with predefined keys - keys = index.load(data, keys=["rvl:foo", "rvl:bar"]) - - # load data with preprocessing step - def add_field(d): - d["new_field"] = 123 - return d - keys = index.load(data, preprocess=add_field) + SchemaValidationError: If validation fails when validate_on_load is enabled. + RedisVLError: If there's an error loading data to Redis. """ try: return self._storage.write( @@ -627,9 +613,14 @@ def add_field(d): batch_size=batch_size, validate=self._validate_on_load, ) - except: - logger.exception("Error while loading data to Redis") + except SchemaValidationError: + # Pass through validation errors directly + logger.exception("Schema validation error while loading data") raise + except Exception as e: + # Wrap other errors as general RedisVL errors + logger.exception("Error while loading data to Redis") + raise RedisVLError(f"Failed to load data: {str(e)}") from e def fetch(self, id: str) -> Optional[Dict[str, Any]]: """Fetch an object from Redis by id. @@ -1246,8 +1237,8 @@ async def load( List[str]: List of keys loaded to Redis. Raises: - ValueError: If the length of provided keys does not match the - length of objects or if validation fails when validate_on_load is enabled. + SchemaValidationError: If validation fails when validate_on_load is enabled. + RedisVLError: If there's an error loading data to Redis. .. 
code-block:: python @@ -1281,9 +1272,14 @@ def add_field(d): batch_size=batch_size, validate=self._validate_on_load, ) - except: - logger.exception("Error while loading data to Redis") + except SchemaValidationError: + # Pass through validation errors directly + logger.exception("Schema validation error while loading data") raise + except Exception as e: + # Wrap other errors as general RedisVL errors + logger.exception("Error while loading data to Redis") + raise RedisVLError(f"Failed to load data: {str(e)}") from e async def fetch(self, id: str) -> Optional[Dict[str, Any]]: """Asynchronously etch an object from Redis by id. The id is typically diff --git a/redisvl/index/storage.py b/redisvl/index/storage.py index f90e45b4..f0af1e5b 100644 --- a/redisvl/index/storage.py +++ b/redisvl/index/storage.py @@ -5,6 +5,7 @@ from redis.asyncio import Redis as AsyncRedis from redis.commands.search.indexDefinition import IndexType +from redisvl.exceptions import SchemaValidationError from redisvl.redis.utils import convert_bytes from redisvl.schema import IndexSchema from redisvl.schema.validation import validate_object @@ -180,7 +181,8 @@ def _preprocess_and_validate_objects( List of tuples (key, processed_obj) for valid objects Raises: - ValueError: If any validation fails with object context + SchemaValidationError: If validation fails, with context about which object failed + ValueError: If any other processing errors occur """ prepared_objects = [] keys_iterator = iter(keys) if keys else None @@ -197,12 +199,6 @@ def _preprocess_and_validate_objects( # Preprocess processed_obj = self._preprocess(obj, preprocess) - # Basic type validation - if not isinstance(processed_obj, dict): - raise ValueError( - f"Object must be a dictionary, got {type(processed_obj).__name__}" - ) - # Schema validation if enabled if validate: processed_obj = self.validate(processed_obj) @@ -210,13 +206,15 @@ def _preprocess_and_validate_objects( # Store valid object with its key for writing prepared_objects.append((key, processed_obj)) + except ValidationError as e: + # Convert Pydantic ValidationError to SchemaValidationError with index context + raise SchemaValidationError(str(e), index=i) from e except Exception as e: - # Enhance error message with object context + # Capture other exceptions with context object_id = f"at index {i}" - if id_field and isinstance(obj, dict) and id_field in obj: - object_id = f"with {id_field}={obj[id_field]}" - - raise ValueError(f"Validation failed for object {object_id}: {str(e)}") + raise ValueError( + f"Error processing object {object_id}: {str(e)}" + ) from e return prepared_objects diff --git a/redisvl/schema/validation.py b/redisvl/schema/validation.py index 51fcf445..b102166c 100644 --- a/redisvl/schema/validation.py +++ b/redisvl/schema/validation.py @@ -10,7 +10,7 @@ import warnings from typing import Any, Dict, List, Optional, Type, Union -from pydantic import BaseModel, Field, ValidationError, create_model, field_validator +from pydantic import BaseModel, Field, field_validator from redisvl.schema import IndexSchema from redisvl.schema.fields import BaseField, FieldTypes, VectorDataType @@ -78,7 +78,10 @@ def _map_field_to_pydantic_type( elif field.type == FieldTypes.VECTOR: # For JSON storage, vectors are always lists if storage_type == StorageType.JSON: - return List[Union[int, float]] + # For int data types, vectors must be ints, otherwise floats + if field.attrs.datatype in (VectorDataType.INT8, VectorDataType.UINT8): + return List[int] + return List[float] else: return 
bytes @@ -88,7 +91,7 @@ def _map_field_to_pydantic_type( @classmethod def _create_model(cls, schema: IndexSchema) -> Type[BaseModel]: """ - Create a Pydantic model from schema definition. + Create a Pydantic model from schema definition using type() approach. Args: schema: The IndexSchema to convert @@ -96,134 +99,111 @@ def _create_model(cls, schema: IndexSchema) -> Type[BaseModel]: Returns: A Pydantic model class with appropriate fields and validators """ - field_definitions = {} - validators = {} - # Get storage type from schema storage_type = schema.index.storage_type - # Create field definitions dictionary for create_model + # Create annotations dictionary for the dynamic model + annotations = {} + class_dict = {} + + # Build annotations and field metadata for field_name, field in schema.fields.items(): field_type = cls._map_field_to_pydantic_type(field, storage_type) - # Create field definition (all fields are optional in the model) - # this handles the cases where objects have missing fields (supported behavior) - field_definitions[field_name] = ( - Optional[field_type], # Make fields optional - Field( - default=None, - json_schema_extra={ - "field_type": field.type, - }, - ), - ) - - # Add field-specific validator info to our validator registry - if field.type == FieldTypes.GEO: - validators[field_name] = {"type": "geo"} - - elif field.type == FieldTypes.VECTOR: - validators[field_name] = { - "type": "vector", - "dims": field.attrs.dims, - "datatype": field.attrs.datatype, - "storage_type": storage_type, - } - - # First create the model class with field definitions - model_name = f"{schema.index.name}__PydanticModel" - model_class = create_model(model_name, **field_definitions) - - # Then add validators to the model class - for field_name, validator_info in validators.items(): - if validator_info["type"] == "geo": - # Add geo validator - validator = cls._create_geo_validator(field_name) - setattr(model_class, f"validate_{field_name}", validator) - - elif validator_info["type"] == "vector": - # Add vector validator - validator = cls._create_vector_validator( - field_name, - validator_info["dims"], - validator_info["datatype"], - validator_info["storage_type"], - ) - setattr(model_class, f"validate_{field_name}", validator) - - return model_class - - @staticmethod - def _create_geo_validator(field_name: str): - """ - Create a validator for geo fields. + # Make all fields optional in the model + annotations[field_name] = Optional[field_type] - Args: - field_name: Name of the field to validate - - Returns: - A validator function that can be attached to a Pydantic model - """ + # Add default=None to make fields truly optional (can be missing from input) + class_dict[field_name] = Field(default=None) - # Create the validator function - def validate_geo_field(cls, value): - # Skip validation for None values - if value is not None: - # Validate against pattern - if not re.match(TypeInferrer.GEO_PATTERN.pattern, value): - raise ValueError( - f"Geo field '{field_name}' value '{value}' is not a valid 'lat,lon' format" - ) - return value - - # Add the field_validator decorator - return field_validator(field_name, mode="after")(validate_geo_field) - - @staticmethod - def _create_vector_validator( - field_name: str, dims: int, datatype: VectorDataType, storage_type: StorageType - ): - """ - Create a validator for vector fields. 
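
The `make_geo_validator` closure above, and the numeric and vector factories that follow, all rely on the same pattern: capture the field name in a closure, decorate the inner function with `field_validator`, and drop the result into the namespace handed to `type()`. A self-contained sketch of that pattern, with an invented field name and rule purely for illustration:

```python
from typing import Optional

from pydantic import BaseModel, Field, field_validator


def make_range_validator(fname: str):
    # The closure captures the field name so each dynamically generated
    # field gets its own validator, mirroring _create_model above.
    @field_validator(fname, mode="after")
    def _validate(cls, value):
        if value is not None and not (0 <= value <= 5):
            raise ValueError(f"Field '{fname}' must be between 0 and 5")
        return value

    return _validate


class_dict = {
    "__annotations__": {"rating": Optional[float]},
    "rating": Field(default=None),
    "validate_rating": make_range_validator("rating"),
    "model_config": {"extra": "allow"},
}

RatingModel = type("RatingModel", (BaseModel,), class_dict)

RatingModel.model_validate({"rating": 4.5})  # ok
RatingModel.model_validate({})               # ok: field is optional
# RatingModel.model_validate({"rating": 9})  # raises ValidationError
```
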
- - Args: - field_name: Name of the field to validate - dims: Expected dimensions of the vector - datatype: Expected datatype of the vector elements - storage_type: Type of storage (HASH or JSON) - - Returns: - A validator function that can be attached to a Pydantic model - """ - - # Create the validator function - def validate_vector_field(cls, value): - # Skip validation for None values - if value is not None: - - # Handle list representation - if isinstance(value, list): + # Register validators for GEO fields + if field.type == FieldTypes.GEO: - # Validate dimensions - if len(value) != dims: - raise ValueError( - f"Vector field '{field_name}' must have {dims} dimensions, got {len(value)}" - ) + def make_geo_validator(fname: str): + @field_validator(fname, mode="after") + def _validate_geo(cls, value): + # Skip validation for None values + if value is not None: + # Validate against pattern + if not TypeInferrer._is_geographic(value): + raise ValueError( + f"Geo field '{fname}' value '{value}' is not a valid 'lat,lon' format" + ) + return value + + return _validate_geo + + class_dict[f"validate_{field_name}"] = make_geo_validator(field_name) + + # Register validators for NUMERIC fields + elif field.type == FieldTypes.NUMERIC: + + def make_numeric_validator(fname: str): + # mode='before' so it catches bools before parsing + @field_validator(fname, mode="before") + def _disallow_bool(cls, value): + if isinstance(value, bool): + raise ValueError(f"Field '{fname}' cannot be boolean.") + return value + + return _disallow_bool + + class_dict[f"validate_{field_name}"] = make_numeric_validator( + field_name + ) - # Validate data types - datatype_str = str(datatype).upper() + # Register validators for VECTOR fields + elif field.type == FieldTypes.VECTOR: - # Integer-based datatypes - if datatype_str in ("INT8", "UINT8"): - # Check type - if not all(isinstance(v, int) for v in value): - raise ValueError( - f"Vector field '{field_name}' must contain only integer values for {datatype_str}" - ) + def make_vector_validator( + fname: str, dims: int, datatype: VectorDataType + ): + @field_validator(fname, mode="after") + def _validate_vector(cls, value): + # Skip validation for None values + if value is not None: + # Handle list representation + if isinstance(value, list): + # Validate dimensions + if len(value) != dims: + raise ValueError( + f"Vector field '{fname}' must have {dims} dimensions, got {len(value)}" + ) + # Validate data types + datatype_str = str(datatype).upper() + # Integer-based datatypes + if datatype_str in ("INT8", "UINT8"): + # Check range for INT8 + if datatype_str == "INT8": + if any(v < -128 or v > 127 for v in value): + raise ValueError( + f"Vector field '{fname}' contains values outside the INT8 range (-128 to 127)" + ) + # Check range for UINT8 + elif datatype_str == "UINT8": + if any(v < 0 or v > 255 for v in value): + raise ValueError( + f"Vector field '{fname}' contains values outside the UINT8 range (0 to 255)" + ) + return value + + return _validate_vector + + class_dict[f"validate_{field_name}"] = make_vector_validator( + field_name, field.attrs.dims, field.attrs.datatype + ) - return value + # Create class dictionary with annotations and field metadata + class_dict.update( + **{ + "__annotations__": annotations, + "model_config": {"arbitrary_types_allowed": True, "extra": "allow"}, + } + ) - return validate_vector_field + # Create the model class using type() + model_name = f"{schema.index.name}__PydanticModel" + return type(model_name, (BaseModel,), class_dict) def 
extract_from_json_path(obj: Dict[str, Any], path: str) -> Any: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py deleted file mode 100644 index 91a558f2..00000000 --- a/tests/unit/conftest.py +++ /dev/null @@ -1,183 +0,0 @@ -""" -Common test fixtures and utilities for RedisVL validation tests. -""" - -from typing import Any, Dict - -import pytest - -from redisvl.schema import IndexSchema -from redisvl.schema.fields import VectorDataType, VectorDistanceMetric - - -@pytest.fixture -def comprehensive_schema(): - """Create a comprehensive schema with all field types for testing.""" - return IndexSchema.from_dict( - { - "index": { - "name": "test-index", - "prefix": "test", - "key_separator": ":", - "storage_type": "hash", - }, - "fields": [ - {"name": "id", "type": "tag"}, - {"name": "title", "type": "text"}, - {"name": "rating", "type": "numeric"}, - {"name": "location", "type": "geo"}, - { - "name": "embedding", - "type": "vector", - "attrs": { - "algorithm": "flat", - "dims": 4, - "datatype": "float32", - "distance_metric": "cosine", - }, - }, - { - "name": "int_vector", - "type": "vector", - "attrs": { - "algorithm": "flat", - "dims": 3, - "datatype": "int8", - "distance_metric": "l2", - }, - }, - { - "name": "hnsw_vector", - "type": "vector", - "attrs": { - "algorithm": "hnsw", - "dims": 3, - "distance_metric": "cosine", - "datatype": "float32", - "m": 16, - "ef_construction": 200, - "ef_runtime": 10, - "epsilon": 0.01, - }, - }, - ], - } - ) - - -@pytest.fixture -def json_schema(): - """Create a schema with JSON storage and path fields.""" - return IndexSchema.from_dict( - { - "index": { - "name": "test-json-index", - "prefix": "test", - "key_separator": ":", - "storage_type": "json", - }, - "fields": [ - {"name": "id", "type": "tag", "path": "$.id"}, - {"name": "user", "type": "tag", "path": "$.metadata.user"}, - {"name": "title", "type": "text", "path": "$.content.title"}, - {"name": "rating", "type": "numeric", "path": "$.metadata.rating"}, - { - "name": "embedding", - "type": "vector", - "path": "$.content.embedding", - "attrs": { - "algorithm": "flat", - "dims": 4, - "datatype": "float32", - "distance_metric": "cosine", - }, - }, - ], - } - ) - - -@pytest.fixture -def valid_data(): - """Sample valid data for testing validation.""" - return { - "id": "doc1", - "title": "Test Document", - "rating": 4.5, - "location": "37.7749,-122.4194", - "embedding": [0.1, 0.2, 0.3, 0.4], - "int_vector": [1, 2, 3], - "hnsw_vector": [0.1, 0.2, 0.3], - } - - -@pytest.fixture -def valid_nested_data(): - """Sample valid nested data for testing JSON path validation.""" - return { - "id": "doc1", - "metadata": {"user": "user123", "rating": 4.5}, - "content": {"title": "Test Document", "embedding": [0.1, 0.2, 0.3, 0.4]}, - } - - -@pytest.fixture -def invalid_data_cases(): - """ - Test cases for invalid data. 
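
Stepping back to `extract_from_json_path`, whose signature appears just above: based on the behavior the unit tests earlier in this patch pin down, it accepts both `$.`-prefixed and bare dotted paths and returns `None` for anything missing. A short usage sketch:

```python
from redisvl.schema.validation import extract_from_json_path

doc = {"metadata": {"user": "user123", "rating": 4.5}}

extract_from_json_path(doc, "$.metadata.user")   # "user123"
extract_from_json_path(doc, "metadata.rating")   # 4.5 (bare paths also work)
extract_from_json_path(doc, "$.missing.path")    # None rather than raising
```
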
- Each case contains: - - field: name of the field - - value: invalid value to test - - error_text: text that should appear in error message - """ - return [ - # Text field errors - {"field": "title", "value": 123, "error_text": "must be a string"}, - # Numeric field errors - {"field": "rating", "value": "high", "error_text": "must be a number"}, - {"field": "rating", "value": "123.45", "error_text": "must be a number"}, - # Tag field errors - {"field": "id", "value": 123, "error_text": "must be a string"}, - # Geo field errors - { - "field": "location", - "value": "invalid_geo", - "error_text": "not a valid 'lat,lon' format", - }, - { - "field": "location", - "value": "1000,-1000", - "error_text": "not a valid 'lat,lon' format", - }, - # Vector field errors - float32 - {"field": "embedding", "value": [0.1, 0.2, 0.3], "error_text": "dimensions"}, - { - "field": "embedding", - "value": [0.1, "string", 0.3, 0.4], - "error_text": "numeric values", - }, - { - "field": "embedding", - "value": "not_a_vector", - "error_text": "must be a list or bytes", - }, - # Vector field errors - int8 - { - "field": "int_vector", - "value": [0.1, 0.2, 0.3], - "error_text": "integer values", - }, - {"field": "int_vector", "value": [1, 2], "error_text": "dimensions"}, - { - "field": "int_vector", - "value": [1000, 2000, 3000], - "error_text": "INT8 values must be between", - }, - # HNSW Vector field errors - {"field": "hnsw_vector", "value": [0.1, 0.2], "error_text": "dimensions"}, - { - "field": "hnsw_vector", - "value": ["a", "b", "c"], - "error_text": "numeric values", - }, - ] diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py index 539a1b76..4a34d340 100644 --- a/tests/unit/test_storage.py +++ b/tests/unit/test_storage.py @@ -1,562 +1,165 @@ -""" -Tests for RedisVL storage classes with focus on validation integration. - -This module tests how the storage classes integrate with the validation system: -1. How validation is used in storage operations -2. Preprocessing and validation flow -3. 
Error handling in write operations -""" - -from typing import Any, Dict -from unittest.mock import MagicMock, Mock, patch - import pytest +from pydantic import ValidationError -from redisvl.index.storage import HashStorage, JsonStorage -from redisvl.schema import IndexInfo, IndexSchema -from redisvl.schema.fields import ( - FlatVectorField, - FlatVectorFieldAttributes, - GeoField, - HNSWVectorField, - HNSWVectorFieldAttributes, - NumericField, - TagField, - TextField, - VectorDataType, - VectorDistanceMetric, -) -from redisvl.schema.validation import validate_object +from redisvl.exceptions import SchemaValidationError +from redisvl.index.storage import BaseStorage, HashStorage, JsonStorage +from redisvl.schema import IndexSchema @pytest.fixture -def sample_schema(): - """Create a comprehensive schema for testing with all field types""" - return IndexSchema.from_dict( - { - "index": { - "name": "test-index", - "prefix": "test", - "key_separator": ":", - "storage_type": "hash", +def sample_hash_schema(): + """Create a sample schema with HASH storage for testing.""" + schema_dict = { + "index": { + "name": "test-hash-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "hash", + }, + "fields": [ + {"name": "test_id", "type": "tag"}, + {"name": "title", "type": "text"}, + {"name": "rating", "type": "numeric"}, + {"name": "location", "type": "geo"}, + { + "name": "embedding", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", + }, }, - "fields": [ - # Standard fields - {"type": "text", "name": "text_field"}, - {"type": "numeric", "name": "num_field"}, - {"type": "tag", "name": "tag_field"}, - {"type": "geo", "name": "geo_field"}, - # Vector fields - { - "type": "vector", - "name": "flat_vector", - "attrs": { - "algorithm": "flat", - "dims": 3, - "distance_metric": "cosine", - "data_type": "float32", - }, + { + "name": "int_vector", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "datatype": "int8", + "distance_metric": "l2", }, - { - "type": "vector", - "name": "hnsw_vector", - "attrs": { - "algorithm": "hnsw", - "dims": 3, - "distance_metric": "cosine", - "data_type": "float32", - "m": 16, - "ef_construction": 200, - "ef_runtime": 10, - "epsilon": 0.01, - }, + }, + ], + } + return IndexSchema.from_dict(schema_dict) + + +@pytest.fixture +def sample_json_schema(): + """Create a sample schema with JSON storage for testing.""" + schema_dict = { + "index": { + "name": "test-json-index", + "prefix": "test", + "key_separator": ":", + "storage_type": "json", + }, + "fields": [ + {"name": "test_id", "type": "tag", "path": "$.test_id"}, + {"name": "user", "type": "tag", "path": "$.metadata.user"}, + {"name": "title", "type": "text", "path": "$.content.title"}, + {"name": "rating", "type": "numeric", "path": "$.metadata.rating"}, + { + "name": "embedding", + "type": "vector", + "path": "$.content.embedding", + "attrs": { + "algorithm": "flat", + "dims": 4, + "datatype": "float32", + "distance_metric": "cosine", }, - ], - } - ) + }, + { + "name": "int_vector", + "type": "vector", + "path": "$.content.int_vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "datatype": "int8", + "distance_metric": "l2", + }, + }, + ], + } + return IndexSchema.from_dict(schema_dict) @pytest.fixture(params=[JsonStorage, HashStorage]) -def storage_instance(request, sample_schema): +def storage_instance(request, sample_hash_schema, sample_json_schema): StorageClass = request.param - instance = 
StorageClass(index_schema=sample_schema)
-    return instance
+    if StorageClass is JsonStorage:
+        return StorageClass(index_schema=sample_json_schema)
+    return StorageClass(index_schema=sample_hash_schema)
 
 
 def test_key_formatting(storage_instance):
     key = "1234"
     generated_key = storage_instance._key(key, "", "")
-    assert generated_key == key
+    assert generated_key == key, "The generated key does not match the expected format."
     generated_key = storage_instance._key(key, "", ":")
-    assert generated_key == key
+    assert generated_key == key, "The generated key does not match the expected format."
     generated_key = storage_instance._key(key, "test", ":")
-    assert generated_key == f"test:{key}"
+    assert (
+        generated_key == f"test:{key}"
+    ), "The generated key does not match the expected format."
 
 
 def test_create_key(storage_instance):
     id_field = "id"
     obj = {id_field: "1234"}
-    expected_key = (
-        f"{storage_instance.index_schema.index.prefix}"
-        f"{storage_instance.index_schema.index.key_separator}"
-        f"{obj[id_field]}"
-    )
+    expected_key = f"{storage_instance.index_schema.index.prefix}{storage_instance.index_schema.index.key_separator}{obj[id_field]}"
     generated_key = storage_instance._create_key(obj, id_field)
     assert (
         generated_key == expected_key
     ), "The generated key does not match the expected format."
 
 
+def test_validate_success(storage_instance):
+    try:
+        storage_instance.validate(
+            {"test_id": "1234", "rating": 5, "user": "john", "title": "engineer"}
+        )
+    except Exception as e:
+        pytest.fail(f"validate should not raise an exception here, but raised {e}")
+
+
+def test_validate_failure(storage_instance):
+    data = {"title": 5}
+    with pytest.raises(ValidationError):
+        storage_instance.validate(data)
+    data = {"user": True}
+    with pytest.raises(ValidationError):
+        storage_instance.validate(data)
+
+
+def test_validate_preprocess_and_validate_failure(storage_instance):
+    data = {"title": 5}
+    storage_instance._preprocess_and_validate_objects(
+        objects=[data], validate=False
+    )
+    with pytest.raises(SchemaValidationError):
+        storage_instance._preprocess_and_validate_objects(objects=[data], validate=True)
+    data = {"user": True}
+    storage_instance._preprocess_and_validate_objects(
+        objects=[data], validate=False
+    )
+    with pytest.raises(SchemaValidationError):
+        storage_instance._preprocess_and_validate_objects(objects=[data], validate=True)
+
+
 def test_preprocess(storage_instance):
     data = {"key": "value"}
-    preprocessed_data = storage_instance._preprocess(data, preprocess=None)
+    preprocessed_data = storage_instance._preprocess(obj=data, preprocess=None)
     assert preprocessed_data == data
 
     def fn(d):
         d["foo"] = "bar"
         return d
 
-    preprocessed_data = storage_instance._preprocess(data, fn)
+    preprocessed_data = storage_instance._preprocess(obj=data, preprocess=fn)
     assert "foo" in preprocessed_data
     assert preprocessed_data["foo"] == "bar"
-
-
-def test_preprocess_and_validate_objects(storage_instance):
-    """Test combined preprocessing and validation"""
-    objects = [
-        {"num_field": 123, "text_field": "valid text"},  # Valid
-        {"num_field": "123", "text_field": "valid text"},  # Invalid numeric field
-    ]
-
-    def preprocess(obj):
-        obj["processed"] = True
-        return obj
-
-    # When validate=True, should raise ValueError for invalid object
-    with pytest.raises(ValueError) as exc_info:
-        storage_instance._preprocess_and_validate_objects(
-            objects, preprocess=preprocess, validate=True
-        )
-
-    # Error message should mention the issue
-    assert "Validation failed" in str(exc_info.value)
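
The flow these new tests pin down, sketched end to end (a hypothetical snippet, where `storage_instance` stands for any storage object built from the fixtures above):

```python
from redisvl.exceptions import SchemaValidationError

bad = {"title": 5}  # numeric value in a text field

# With validation disabled, the object passes straight through:
storage_instance._preprocess_and_validate_objects(objects=[bad], validate=False)

# With validation enabled, the write path fails fast, and the error
# message carries the offending object's position:
try:
    storage_instance._preprocess_and_validate_objects(objects=[bad], validate=True)
except SchemaValidationError as e:
    print(e)  # e.g. "Validation failed for object at index 0: ..."
```
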
assert "must be a number" in str(exc_info.value) - - # When validate=False, should process both objects without errors - prepared_objects = storage_instance._preprocess_and_validate_objects( - objects, preprocess=preprocess, validate=False - ) - - assert len(prepared_objects) == 2 - # Preprocessing should have worked for both objects - assert all(obj[1].get("processed") for obj in prepared_objects) - - -def test_validate_object(storage_instance): - """Test validation of individual objects""" - - # Valid data should be returned unchanged (except for any type coercion) - valid_data = { - "text_field": "some text", - "num_field": 123.45, - "tag_field": "tag1,tag2,tag3", - "geo_field": "37.7749,-122.4194", - "flat_vector": [0.1, 0.2, 0.3], - "hnsw_vector": [0.4, 0.5, 0.6], - } - - validated = storage_instance.validate(valid_data) - assert validated is not None - assert validated["num_field"] == valid_data["num_field"] - assert validated["text_field"] == valid_data["text_field"] - - # Invalid text field - invalid_text = valid_data.copy() - invalid_text["text_field"] = 123 - with pytest.raises(ValueError) as exc_info: - storage_instance.validate(invalid_text) - assert "text_field" in str(exc_info.value) - - # Invalid numeric field (string that looks like number) - invalid_numeric = valid_data.copy() - invalid_numeric["num_field"] = "123.45" - with pytest.raises(ValueError) as exc_info: - storage_instance.validate(invalid_numeric) - assert "num_field" in str(exc_info.value) - - # Invalid geo field - invalid_geo = valid_data.copy() - invalid_geo["geo_field"] = "invalid-geo-format" - with pytest.raises(ValueError) as exc_info: - storage_instance.validate(invalid_geo) - assert "geo_field" in str(exc_info.value) - - # Invalid vector field (wrong dimensions) - invalid_vector_dims = valid_data.copy() - invalid_vector_dims["flat_vector"] = [0.1, 0.2] - with pytest.raises(ValueError) as exc_info: - storage_instance.validate(invalid_vector_dims) - assert "flat_vector" in str(exc_info.value) - assert "dimensions" in str(exc_info.value) - - # Invalid vector field (non-numeric values) - invalid_vector_values = valid_data.copy() - invalid_vector_values["hnsw_vector"] = ["a", "b", "c"] - with pytest.raises(ValueError) as exc_info: - storage_instance.validate(invalid_vector_values) - assert "hnsw_vector" in str(exc_info.value) - assert "numeric values" in str(exc_info.value) - - -def test_partial_object_validation(storage_instance): - """Test validation of partial objects (missing fields)""" - - # Object with only some fields - partial_data = { - "text_field": "valid text", - # Missing num_field, tag_field, etc. 
- } - - # Should validate successfully since fields are optional - validated = storage_instance.validate(partial_data) - assert validated is not None - assert "text_field" in validated - assert "num_field" not in validated - - # Explicitly setting a field to None should result in it being excluded - null_field_data = {"text_field": "valid text", "num_field": None} - - validated = storage_instance.validate(null_field_data) - assert "num_field" not in validated - - -def test_write_with_validation(storage_instance, mocker): - """Test the write method with validation enabled""" - # Mock the _set method to avoid actual Redis calls - mocker.patch.object(storage_instance, "_set") - - # Mock pipeline execution - mock_pipe = mocker.MagicMock() - mock_pipe.execute = mocker.MagicMock() - - # Mock Redis client - mock_client = mocker.MagicMock() - mock_client.pipeline.return_value.__enter__.return_value = mock_pipe - - # Valid and invalid objects - objects = [ - {"text_field": "valid", "num_field": 123}, # Valid - {"text_field": 456, "num_field": 789}, # Invalid text field - ] - - # With validation enabled, should raise error on first invalid object - with pytest.raises(ValueError) as exc_info: - storage_instance.write(mock_client, objects, validate=True) - - assert "Validation failed" in str(exc_info.value) - assert "text_field" in str(exc_info.value) - - # With validation disabled, should process all objects - keys = storage_instance.write(mock_client, objects, validate=False) - - assert len(keys) == 2 - assert storage_instance._set.call_count == 2 - - -class TestBaseStorageValidation: - """Tests for validation in BaseStorage class.""" - - def test_validate_object(self, comprehensive_schema, valid_data): - """Test the validate_object method.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Validate object - validated = storage.validate_object(valid_data) - - # Verify object was validated - assert validated is not None - assert "id" in validated - assert "title" in validated - - def test_validate_object_with_invalid_data(self, comprehensive_schema, valid_data): - """Test validation with invalid data.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Create invalid data - invalid_data = valid_data.copy() - invalid_data["rating"] = "not a number" - - # Validation should fail - with pytest.raises(ValueError) as exc_info: - storage.validate_object(invalid_data) - - # Error message should mention validation failure - assert "Validation failed" in str(exc_info.value) - - def test_preprocess_and_validate_objects_success( - self, comprehensive_schema, valid_data - ): - """Test _preprocess_and_validate_objects with valid data.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Process objects - objects = [valid_data] - validated_objects = storage._preprocess_and_validate_objects(objects) - - # Verify objects were validated - assert len(validated_objects) == 1 - assert "id" in validated_objects[0] - assert "title" in validated_objects[0] - - def test_preprocess_and_validate_objects_fail( - self, comprehensive_schema, valid_data - ): - """Test _preprocess_and_validate_objects with invalid data.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Create mix of valid and invalid data - invalid_data = valid_data.copy() - invalid_data["rating"] = "not a number" - - # Process should fail fast on first invalid object - with pytest.raises(ValueError) as exc_info: - 
storage._preprocess_and_validate_objects([invalid_data, valid_data]) - - # Error message should mention validation failure - assert "Validation failed" in str(exc_info.value) - - def test_write_one_validation(self, comprehensive_schema, valid_data): - """Test that write_one validates objects.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = BaseStorage(schema=comprehensive_schema, client=client_mock) - - # Mock hset to avoid actual Redis call - client_mock.hset = Mock() - - # Call write_one - storage.write_one(valid_data) - - # Verify hset was called - client_mock.hset.assert_called_once() - - def test_write_one_validation_fail(self, comprehensive_schema, valid_data): - """Test that write_one fails on invalid data.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = BaseStorage(schema=comprehensive_schema, client=client_mock) - - # Create invalid data - invalid_data = valid_data.copy() - invalid_data["rating"] = "not a number" - - # Call write_one with invalid data - with pytest.raises(ValueError) as exc_info: - storage.write_one(invalid_data) - - # Verify error and that hset was not called - assert "Validation failed" in str(exc_info.value) - client_mock.hset.assert_not_called() - - def test_write_many_validation(self, comprehensive_schema, valid_data): - """Test that write_many validates all objects.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = BaseStorage(schema=comprehensive_schema, client=client_mock) - - # Mock pipeline to avoid actual Redis call - pipeline_mock = Mock() - client_mock.pipeline.return_value.__enter__.return_value = pipeline_mock - - # Call write_many with multiple valid objects - storage.write_many([valid_data, valid_data.copy()]) - - # Verify pipeline executed - pipeline_mock.execute.assert_called_once() - - def test_write_many_validation_fail(self, comprehensive_schema, valid_data): - """Test that write_many fails on invalid data.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = BaseStorage(schema=comprehensive_schema, client=client_mock) - - # Mock pipeline to avoid actual Redis call - pipeline_mock = Mock() - client_mock.pipeline.return_value.__enter__.return_value = pipeline_mock - - # Create invalid data - invalid_data = valid_data.copy() - invalid_data["rating"] = "not a number" - - # Call write_many with invalid data - with pytest.raises(ValueError) as exc_info: - storage.write_many([valid_data, invalid_data]) - - # Verify error and that execute was not called - assert "Validation failed" in str(exc_info.value) - pipeline_mock.execute.assert_not_called() - - -class TestJsonStorageValidation: - """Tests for validation in JsonStorage class.""" - - def test_validate_json_document(self, json_schema, valid_nested_data): - """Test validating a JSON document.""" - # Create JSON storage - storage = JsonStorage(schema=json_schema) - - # Validate object - validated = storage.validate_object(valid_nested_data) - - # Verify object was validated and flattened - assert validated is not None - assert "id" in validated - assert "user" in validated - assert "title" in validated - assert "rating" in validated - - def test_validate_json_missing_paths(self, json_schema): - """Test validating JSON with missing paths.""" - # Create JSON storage - storage = JsonStorage(schema=json_schema) - - # Create object with missing paths - partial_nested = { - "id": "doc1", - "metadata": { - "user": "user123" - # missing rating - }, - "content": { - "title": 
"Test Document" - # missing embedding - }, - } - - # Validate object - validated = storage.validate_object(partial_nested) - - # Verify validation succeeds with missing fields - assert validated is not None - assert "id" in validated - assert "user" in validated - assert "title" in validated - - # Missing fields should be absent - assert "rating" not in validated - assert "embedding" not in validated - - def test_validate_json_invalid_path(self, json_schema, valid_nested_data): - """Test validating JSON with invalid path values.""" - # Create JSON storage - storage = JsonStorage(schema=json_schema) - - # Create object with invalid data - invalid_nested = valid_nested_data.copy() - invalid_nested["metadata"]["rating"] = "not a number" - - # Validation should fail - with pytest.raises(ValueError) as exc_info: - storage.validate_object(invalid_nested) - - # Error message should mention validation failure - assert "Validation failed" in str(exc_info.value) - assert "rating" in str(exc_info.value) - - def test_write_json_document(self, json_schema, valid_nested_data): - """Test writing a JSON document.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = JsonStorage(schema=json_schema, client=client_mock) - - # Mock json.set to avoid actual Redis call - client_mock.json.set = Mock() - - # Call write_one - storage.write_one(valid_nested_data) - - # Verify json.set was called - client_mock.json.set.assert_called_once() - - def test_write_json_validation_fail(self, json_schema, valid_nested_data): - """Test that write fails on invalid JSON.""" - # Create storage with mocked redis client - client_mock = Mock() - storage = JsonStorage(schema=json_schema, client=client_mock) - - # Create invalid data - invalid_nested = valid_nested_data.copy() - invalid_nested["metadata"]["rating"] = "not a number" - - # Call write_one with invalid data - with pytest.raises(ValueError) as exc_info: - storage.write_one(invalid_nested) - - # Verify error and that json.set was not called - assert "Validation failed" in str(exc_info.value) - client_mock.json.set.assert_not_called() - - -@patch("redisvl.schema.validation.validate_object") -class TestValidationIntegration: - """Tests for integration between storage and validation.""" - - def test_validate_object_is_called( - self, mock_validate, comprehensive_schema, valid_data - ): - """Test that validate_object is called from BaseStorage.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Set up mock to return the input data - mock_validate.return_value = valid_data - - # Call validate_object - storage.validate_object(valid_data) - - # Verify mock was called with correct args - mock_validate.assert_called_once_with(comprehensive_schema, valid_data) - - def test_preprocess_calls_validate_for_each_object( - self, mock_validate, comprehensive_schema, valid_data - ): - """Test that _preprocess_and_validate_objects calls validate for each object.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Set up mock to return the input data - mock_validate.return_value = valid_data - - # Call _preprocess_and_validate_objects with multiple objects - objects = [valid_data, valid_data.copy(), valid_data.copy()] - storage._preprocess_and_validate_objects(objects) - - # Verify mock was called for each object - assert mock_validate.call_count == len(objects) - - def test_preprocess_stops_on_first_validation_error( - self, mock_validate, comprehensive_schema, valid_data - ): - """Test that processing 
stops on first validation error.""" - # Create storage - storage = BaseStorage(schema=comprehensive_schema) - - # Set up mock to raise error on second call - mock_validate.side_effect = [ - valid_data, - ValueError("Validation failed for 2nd object"), - valid_data, - ] - - # Call _preprocess_and_validate_objects - objects = [valid_data, valid_data.copy(), valid_data.copy()] - with pytest.raises(ValueError) as exc_info: - storage._preprocess_and_validate_objects(objects) - - # Verify error and that mock was called twice - assert "Validation failed for 2nd object" in str(exc_info.value) - assert mock_validate.call_count == 2 diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py index 6431303c..ac67e810 100644 --- a/tests/unit/test_validation.py +++ b/tests/unit/test_validation.py @@ -9,12 +9,13 @@ """ import re -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple, Union import pytest from redisvl.schema import IndexSchema from redisvl.schema.fields import FieldTypes, VectorDataType +from redisvl.schema.schema import StorageType from redisvl.schema.type_utils import TypeInferrer from redisvl.schema.validation import ( SchemaModelGenerator, @@ -22,19 +23,21 @@ validate_object, ) +# -------------------- FIXTURES -------------------- + @pytest.fixture -def sample_schema(): - """Create a sample schema with different field types for testing.""" +def sample_hash_schema(): + """Create a sample schema with HASH storage for testing.""" schema_dict = { "index": { - "name": "test-index", + "name": "test-hash-index", "prefix": "test", "key_separator": ":", "storage_type": "hash", }, "fields": [ - {"name": "id", "type": "tag"}, + {"name": "test_id", "type": "tag"}, {"name": "title", "type": "text"}, {"name": "rating", "type": "numeric"}, {"name": "location", "type": "geo"}, @@ -48,6 +51,16 @@ def sample_schema(): "distance_metric": "cosine", }, }, + { + "name": "int_vector", + "type": "vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "datatype": "int8", + "distance_metric": "l2", + }, + }, ], } return IndexSchema.from_dict(schema_dict) @@ -55,7 +68,7 @@ def sample_schema(): @pytest.fixture def sample_json_schema(): - """Create a sample schema with JSON storage and path fields.""" + """Create a sample schema with JSON storage for testing.""" schema_dict = { "index": { "name": "test-json-index", @@ -64,7 +77,7 @@ def sample_json_schema(): "storage_type": "json", }, "fields": [ - {"name": "id", "type": "tag", "path": "$.id"}, + {"name": "test_id", "type": "tag", "path": "$.test_id"}, {"name": "user", "type": "tag", "path": "$.metadata.user"}, {"name": "title", "type": "text", "path": "$.content.title"}, {"name": "rating", "type": "numeric", "path": "$.metadata.rating"}, @@ -79,71 +92,178 @@ def sample_json_schema(): "distance_metric": "cosine", }, }, + { + "name": "int_vector", + "type": "vector", + "path": "$.content.int_vector", + "attrs": { + "algorithm": "flat", + "dims": 3, + "datatype": "int8", + "distance_metric": "l2", + }, + }, ], } return IndexSchema.from_dict(schema_dict) @pytest.fixture -def valid_data(): - """Sample valid data for testing validation.""" +def valid_hash_data(): + """Sample valid data for testing HASH storage validation.""" return { - "id": "doc1", + "test_id": "doc1", "title": "Test Document", "rating": 4.5, "location": "37.7749,-122.4194", - "embedding": [0.1, 0.2, 0.3, 0.4], + "embedding": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", # Bytes for HASH + "int_vector": b"\x01\x02\x03", # 
Bytes for HASH } @pytest.fixture -def valid_nested_data(): - """Sample valid nested data for testing JSON path validation.""" +def valid_json_data(): + """Sample valid data for testing JSON storage validation.""" return { - "id": "doc1", + "test_id": "doc1", "metadata": {"user": "user123", "rating": 4.5}, - "content": {"title": "Test Document", "embedding": [0.1, 0.2, 0.3, 0.4]}, + "content": { + "title": "Test Document", + "embedding": [0.1, 0.2, 0.3, 0.4], # List for JSON + "int_vector": [1, 2, 3], # List for JSON + }, } +# -------------------- TEST HELPERS -------------------- + + +def validate_field( + schema: IndexSchema, + field_name: str, + value: Any, + should_pass: bool, + error_text: Optional[str] = None, +) -> Tuple[bool, Optional[str]]: + """ + Helper function to validate a field value against a schema. + + Args: + schema: The schema to validate against + field_name: The name of the field to validate + value: The value to validate + should_pass: Whether validation should pass + error_text: Expected error text if validation should fail + + Returns: + Tuple of (validation_success, error_message) + """ + # Get model for schema + model_class = SchemaModelGenerator.get_model_for_schema(schema) + + # Create test data with minimal viable fields + test_data = {field_name: value} + + # Try to validate + try: + validated = model_class.model_validate(test_data) + + # If we got here, validation passed + success = True + error_msg = None + + except Exception as e: + # Validation failed + success = False + error_msg = str(e) + + # Check that the result matches the expectation, surfacing any error message + assert ( + success == should_pass + ), f"Validation {'passed' if success else 'failed'} but expected {'pass' if should_pass else 'fail'}: {error_msg}" + + # Check error text if specified and validation failed + if not success and error_text and error_msg: + assert ( + error_text in error_msg + ), f"Error '{error_msg}' does not contain expected text '{error_text}'" + + return success, error_msg + + +# -------------------- CATEGORY 1: BASIC UNIT TESTS -------------------- + + class TestSchemaModelGenerator: """Tests for the SchemaModelGenerator class.""" - def test_get_model_for_schema(self, sample_schema): + @pytest.mark.parametrize("schema_type", ["hash", "json"]) + def test_get_model_for_schema( + self, schema_type, sample_hash_schema, sample_json_schema + ): """Test generating a model from a schema.""" + # Select schema based on type + schema = sample_hash_schema if schema_type == "hash" else sample_json_schema + # Get model for schema - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) + model_class = SchemaModelGenerator.get_model_for_schema(schema) # Verify model name matches the index name - assert model_class.__name__ == "test-index__PydanticModel" + assert model_class.__name__ == f"{schema.index.name}__PydanticModel" # Verify model has expected fields - for field_name in sample_schema.field_names: + for field_name in schema.field_names: assert field_name in model_class.model_fields - def test_model_caching(self, sample_schema): + def test_model_caching(self, sample_hash_schema): """Test that models are cached and reused.""" # Get model twice - model1 = SchemaModelGenerator.get_model_for_schema(sample_schema) - model2 = SchemaModelGenerator.get_model_for_schema(sample_schema) + model1 = SchemaModelGenerator.get_model_for_schema(sample_hash_schema) + model2 = 
SchemaModelGenerator.get_model_for_schema(sample_hash_schema) # Verify same instance assert model1 is model2 - def test_type_mapping(self, sample_schema): + @pytest.mark.parametrize( + "field_type,storage_type,expected_type", + [ + (FieldTypes.TEXT, StorageType.HASH, str), + (FieldTypes.TAG, StorageType.HASH, str), + (FieldTypes.NUMERIC, StorageType.HASH, Union[int, float]), + (FieldTypes.GEO, StorageType.HASH, str), + (FieldTypes.VECTOR, StorageType.HASH, bytes), + (FieldTypes.TEXT, StorageType.JSON, str), + (FieldTypes.TAG, StorageType.JSON, str), + (FieldTypes.NUMERIC, StorageType.JSON, Union[int, float]), + (FieldTypes.GEO, StorageType.JSON, str), + (FieldTypes.VECTOR, StorageType.JSON, List[float]), + ], + ) + def test_type_mapping(self, field_type, storage_type, expected_type): """Test mapping Redis field types to Pydantic types.""" - for field_name, field in sample_schema.fields.items(): - field_type = SchemaModelGenerator._map_field_to_pydantic_type(field) - - # Verify each field type maps to expected Python type - if field.type == FieldTypes.TEXT: - assert field_type == str - elif field.type == FieldTypes.TAG: - assert field_type == str - elif field.type == FieldTypes.NUMERIC: - assert field_type.__origin__ == type(Union) # Check it's a Union - elif field.type == FieldTypes.VECTOR: - assert field_type.__origin__ == type(Union) # Check it's a Union + + # Create a basic field of the specified type + class SimpleField: + def __init__(self, ftype): + self.type = ftype + # Add attrs for vector fields + if ftype == FieldTypes.VECTOR: + + class Attrs: + dims = 4 + datatype = VectorDataType.FLOAT32 + + self.attrs = Attrs() + + field = SimpleField(field_type) + field_type_result = SchemaModelGenerator._map_field_to_pydantic_type( + field, storage_type + ) + + assert field_type_result == expected_type def test_unsupported_field_type(self): """Test that an error is raised for unsupported field types.""" @@ -154,362 +274,419 @@ class DummyField: # Mapping should raise ValueError with pytest.raises(ValueError) as exc_info: - SchemaModelGenerator._map_field_to_pydantic_type(DummyField()) + SchemaModelGenerator._map_field_to_pydantic_type( + DummyField(), StorageType.HASH + ) assert "Unsupported field type" in str(exc_info.value) -class TestFieldValidators: - """Tests for field-specific validators.""" - - def test_text_field_validation(self, sample_schema, valid_data): - """Test validation of text fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid text field - valid = valid_data.copy() - validated = model_class.model_validate(valid) - assert validated.title == "Test Document" +class TestJsonPathExtraction: + """Tests for JSON path extraction functionality.""" - # Invalid text field (number) - invalid = valid_data.copy() - invalid["title"] = 123 - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid) - assert "title" in str(exc_info.value) - assert "must be a string" in str(exc_info.value) - - def test_tag_field_validation(self, sample_schema, valid_data): - """Test validation of tag fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid tag field - valid = valid_data.copy() - validated = model_class.model_validate(valid) - assert validated.id == "doc1" - - # Invalid tag field (number) - invalid = valid_data.copy() - invalid["id"] = 123 - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid) - assert "id" in str(exc_info.value) - assert "must be a string" 
in str(exc_info.value) - - def test_numeric_field_validation(self, sample_schema, valid_data): - """Test validation of numeric fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid numeric field (integer) - valid_int = valid_data.copy() - valid_int["rating"] = 5 - validated = model_class.model_validate(valid_int) - assert validated.rating == 5 - - # Valid numeric field (float) - valid_float = valid_data.copy() - valid_float["rating"] = 4.5 - validated = model_class.model_validate(valid_float) - assert validated.rating == 4.5 - - # Invalid numeric field (string) - invalid = valid_data.copy() - invalid["rating"] = "high" - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid) - assert "rating" in str(exc_info.value) - assert "must be a number" in str(exc_info.value) - - # Invalid numeric field (string that looks like number) - invalid_num_str = valid_data.copy() - invalid_num_str["rating"] = "4.5" - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_num_str) - assert "rating" in str(exc_info.value) - assert "must be a number" in str(exc_info.value) - - def test_geo_field_validation(self, sample_schema, valid_data): - """Test validation of geo fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid geo format - valid_geo = valid_data.copy() - valid_geo["location"] = "37.7749,-122.4194" - validated = model_class.model_validate(valid_geo) - assert validated.location == "37.7749,-122.4194" - - # Invalid geo format (not matching lat,lon pattern) - invalid_geo = valid_data.copy() - invalid_geo["location"] = "invalid_geo" - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_geo) - assert "location" in str(exc_info.value) - assert "not a valid 'lat,lon' format" in str(exc_info.value) - - # Verify the geo pattern actually works with valid formats - valid_formats = [ - "0,0", - "90,-180", - "-90,180", - "37.7749,-122.4194", - "37.7749,122.4194", - "-37.7749,-122.4194", - ] - for format in valid_formats: - assert re.match(TypeInferrer.GEO_PATTERN.pattern, format) - - # Verify invalid formats fail the pattern - invalid_formats = [ - "invalid", - "37.7749", - "37.7749,", - ",122.4194", - "91,0", # Latitude > 90 - "-91,0", # Latitude < -90 - "0,181", # Longitude > 180 - "0,-181", # Longitude < -180 - ] - for format in invalid_formats: - assert not re.match(TypeInferrer.GEO_PATTERN.pattern, format) - - def test_vector_field_validation_float(self, sample_schema, valid_data): - """Test validation of float vector fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid vector - valid_vector = valid_data.copy() - valid_vector["embedding"] = [0.1, 0.2, 0.3, 0.4] - validated = model_class.model_validate(valid_vector) - assert validated.embedding == [0.1, 0.2, 0.3, 0.4] - - # Valid vector as bytes - valid_bytes = valid_data.copy() - valid_bytes["embedding"] = b"\x00\x01\x02\x03" - validated = model_class.model_validate(valid_bytes) - assert validated.embedding == b"\x00\x01\x02\x03" - - # Invalid vector type (string) - invalid_type = valid_data.copy() - invalid_type["embedding"] = "not a vector" - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_type) - assert "embedding" in str(exc_info.value) - - # Invalid dimensions - invalid_dims = valid_data.copy() - invalid_dims["embedding"] = [0.1, 0.2, 0.3] # 3 dimensions instead of 4 - with pytest.raises(ValueError) as exc_info: - 
model_class.model_validate(invalid_dims) - assert "embedding" in str(exc_info.value) - assert "dimensions" in str(exc_info.value) + @pytest.mark.parametrize( + "path,expected_value", + [ + ("$.test_id", "doc1"), + ("$.metadata.user", "user123"), + ("$.metadata.rating", 4.5), + ("$.content.title", "Test Document"), + ("$.content.embedding", [0.1, 0.2, 0.3, 0.4]), + ("metadata.user", "user123"), # alternate format + ("$.nonexistent", None), # nonexistent path + ("$.metadata.nonexistent", None), # nonexistent nested path + ], + ) + def test_extract_from_json_path(self, valid_json_data, path, expected_value): + """Test extracting values using JSON paths.""" + assert extract_from_json_path(valid_json_data, path) == expected_value - # Invalid vector values - invalid_values = valid_data.copy() - invalid_values["embedding"] = [0.1, "string", 0.3, 0.4] - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_values) - assert "embedding" in str(exc_info.value) - - def test_vector_field_validation_int(self, sample_schema, valid_data): - """Test validation of integer vector fields.""" - model_class = SchemaModelGenerator.get_model_for_schema(sample_schema) - - # Valid integer vector - valid_vector = valid_data.copy() - valid_vector["int_vector"] = [1, 2, 3] - validated = model_class.model_validate(valid_vector) - assert validated.int_vector == [1, 2, 3] - - # Invalid: float values in int vector - invalid_floats = valid_data.copy() - invalid_floats["int_vector"] = [0.1, 0.2, 0.3] - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_floats) - assert "int_vector" in str(exc_info.value) - assert "integer values" in str(exc_info.value) - # Invalid: values outside INT8 range - invalid_range = valid_data.copy() - invalid_range["int_vector"] = [1000, 2000, 3000] # Outside INT8 range - with pytest.raises(ValueError) as exc_info: - model_class.model_validate(invalid_range) - assert "int_vector" in str(exc_info.value) - assert "must be between" in str(exc_info.value) +# # -------------------- CATEGORY 2: PARAMETRIZED VALIDATOR TESTS -------------------- -class TestJsonPathValidation: - """Tests for JSON path-based validation.""" +class TestBasicFieldValidation: + """Tests for validating non-vector field types.""" - def test_extract_from_json_path(self, valid_nested_data): - """Test extracting values using JSON paths.""" - # Test simple path - assert extract_from_json_path(valid_nested_data, "$.id") == "doc1" + @pytest.mark.parametrize( + "field_type,field_name,valid_values,invalid_values", + [ + # TEXT fields + ( + "text", + "title", + [("Test Document", None), ("123", None), ("", None)], + [(123, "string"), (True, "string"), ([], "string")], + ), + # TAG fields + ( + "tag", + "test_id", + [("doc1", None), ("123", None), ("abc,def", None), ("", None)], + [ + (123, "string"), + (True, "string"), + ([], "string"), + ([1, 2, 3], "string"), + ], + ), + # NUMERIC fields + ( + "numeric", + "rating", + [(5, None), (4.5, None), (0, None), (-1.5, None), ("5.3", None)], + [("high", "number"), (True, "boolean"), ([], "number")], + ), + # GEO fields + ( + "geo", + "location", + [ + ("0,0", None), + ("90,-180", None), + ("-90,180", None), + ("37.7749,-122.4194", None), + ], + [ + ("invalid_geo", "lat,lon"), + ("37.7749", "lat,lon"), + ("37.7749,", "lat,lon"), + (",122.4194", "lat,lon"), + ("91,0", "lat,lon"), # Latitude > 90 + ("-91,0", "lat,lon"), # Latitude < -90 + ("0,181", "lat,lon"), # Longitude > 180 + ("0,-181", "lat,lon"), # Longitude < -180 + (123, "string"), + 
(True, "string"), + ], + ), + ], + ) + def test_basic_field_validation( + self, sample_hash_schema, field_type, field_name, valid_values, invalid_values + ): + """ + Test validation of basic field types (text, tag, numeric, geo). + + This test consolidates previously separate tests for different field types. + """ + # Test valid values + for value, _ in valid_values: + validate_field(sample_hash_schema, field_name, value, True) + + # For GEO fields, also verify pattern + if field_type == "geo" and isinstance(value, str): + assert re.match(TypeInferrer.GEO_PATTERN.pattern, value) + + # Test invalid values + for value, error_text in invalid_values: + validate_field(sample_hash_schema, field_name, value, False, error_text) + + # For GEO fields, also verify pattern failure + if field_type == "geo" and isinstance(value, str): + assert not re.match(TypeInferrer.GEO_PATTERN.pattern, value) - # Test nested path - assert extract_from_json_path(valid_nested_data, "$.metadata.user") == "user123" - assert extract_from_json_path(valid_nested_data, "$.metadata.rating") == 4.5 - assert ( - extract_from_json_path(valid_nested_data, "$.content.title") - == "Test Document" + @pytest.mark.parametrize( + "test_case", + [ + # Valid cases for HASH storage (bytes) + { + "storage": StorageType.HASH, + "field_name": "embedding", + "value": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "valid": True, + "error_text": None, + "description": "Valid bytes for HASH storage", + }, + { + "storage": StorageType.HASH, + "field_name": "int_vector", + "value": b"\x01\x02\x03", + "valid": True, + "error_text": None, + "description": "Valid bytes for HASH storage (int vector)", + }, + # Invalid cases for HASH storage (trying to use lists) + { + "storage": StorageType.HASH, + "field_name": "embedding", + "value": [0.1, 0.2, 0.3, 0.4], + "valid": False, + "error_text": "bytes", + "description": "List not valid for HASH storage", + }, + # Valid cases for JSON storage (lists) + { + "storage": StorageType.JSON, + "field_name": "embedding", + "value": [0.1, 0.2, 0.3, 0.4], + "valid": True, + "error_text": None, + "description": "Valid list for JSON storage", + }, + { + "storage": StorageType.JSON, + "field_name": "int_vector", + "value": [1, 2, 3], + "valid": True, + "error_text": None, + "description": "Valid int list for JSON storage", + }, + # Invalid cases for JSON storage (trying to use bytes) + { + "storage": StorageType.JSON, + "field_name": "embedding", + "value": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "valid": False, + "error_text": "list", + "description": "Bytes not valid for JSON storage", + }, + # Dimension validation + { + "storage": StorageType.JSON, + "field_name": "embedding", + "value": [0.1, 0.2, 0.3], # Should be 4 dimensions + "valid": False, + "error_text": "dimensions", + "description": "Wrong dimensions for vector", + }, + # Type validation for int vectors + { + "storage": StorageType.JSON, + "field_name": "int_vector", + "value": [0.1, 0.2, 0.3], # Should be integers + "valid": False, + "error_text": "integer", + "description": "Float values in int vector", + }, + ], + ) + def test_vector_field_validation( + self, sample_hash_schema, sample_json_schema, test_case + ): + """Test validation of vector fields with storage-specific requirements.""" + # Select the appropriate schema based on storage type + schema = ( + sample_hash_schema + if test_case["storage"] == StorageType.HASH + else sample_json_schema ) - assert 
extract_from_json_path(valid_nested_data, "$.content.embedding") == [ - 0.1, - 0.2, - 0.3, - 0.4, - ] - - # Test non-existent path - assert extract_from_json_path(valid_nested_data, "$.nonexistent") is None - assert ( - extract_from_json_path(valid_nested_data, "$.metadata.nonexistent") is None + + # Validate the field + validate_field( + schema, + test_case["field_name"], + test_case["value"], + test_case["valid"], + test_case["error_text"], ) - # Test path with alternate formats - assert extract_from_json_path(valid_nested_data, "metadata.user") == "user123" - def test_validate_nested_json(self, sample_json_schema, valid_nested_data): - """Test validating a nested JSON object.""" - # Validate nested object - validated = validate_object(sample_json_schema, valid_nested_data) +class TestNestedJsonValidation: + """Tests for JSON path-based validation with nested structures.""" - # Verify validation succeeds and flattens the structure - assert validated is not None - assert "id" in validated - assert "user" in validated - assert "title" in validated - assert "rating" in validated - assert "embedding" in validated - - # Verify values were extracted correctly - assert validated["id"] == "doc1" - assert validated["user"] == "user123" - assert validated["title"] == "Test Document" - assert validated["rating"] == 4.5 - assert validated["embedding"] == [0.1, 0.2, 0.3, 0.4] - - def test_validate_nested_json_missing_paths(self, sample_json_schema): - """Test validating a nested JSON with missing paths.""" - # Nested object with missing paths - partial_nested = { - "id": "doc1", - "metadata": { - "user": "user123" - # missing rating + @pytest.mark.parametrize( + "test_case", + [ + # Complete valid data + { + "data": { + "test_id": "doc1", + "metadata": {"user": "user123", "rating": 4.5}, + "content": { + "title": "Test Document", + "embedding": [0.1, 0.2, 0.3, 0.4], + "int_vector": [1, 2, 3], + }, + }, + "expected_fields": [ + "test_id", + "user", + "title", + "rating", + "embedding", + "int_vector", + ], + "missing_fields": [], }, - "content": { - "title": "Test Document" - # missing embedding + # Partial data - missing some fields + { + "data": { + "test_id": "doc1", + "metadata": {"user": "user123"}, + "content": {"title": "Test Document"}, + }, + "expected_fields": ["test_id", "user", "title"], + "missing_fields": ["rating", "embedding", "int_vector"], }, - } - - # Validate object - validated = validate_object(sample_json_schema, partial_nested) - - # Verify validation succeeds with partial data - assert validated is not None - assert "id" in validated - assert "user" in validated - assert "title" in validated - assert "rating" not in validated - assert "embedding" not in validated - - -class TestObjectValidation: - """Tests for complete object validation.""" - - def test_validate_valid_object(self, sample_schema, valid_data): - """Test validating a valid object.""" + # Minimal data + { + "data": {"test_id": "doc1"}, + "expected_fields": ["test_id"], + "missing_fields": [ + "user", + "title", + "rating", + "embedding", + "int_vector", + ], + }, + ], + ) + def test_nested_json_validation(self, sample_json_schema, test_case): + """Test validating nested JSON with various data structures.""" # Validate object - validated = validate_object(sample_schema, valid_data) + validated = validate_object(sample_json_schema, test_case["data"]) - # Verify no exceptions and data is returned - assert validated is not None + # Verify expected fields are present + for field in test_case["expected_fields"]: + assert 
field in validated - # Verify all fields are present - for field_name in sample_schema.field_names: - if field_name in valid_data: - assert field_name in validated + # Verify missing fields are not present + for field in test_case["missing_fields"]: + assert field not in validated - def test_validate_missing_optional_fields(self, sample_schema): - """Test validating an object with missing optional fields.""" - # Object with only some fields - partial_data = {"id": "doc1", "title": "Test Document"} - # Validate object - validated = validate_object(sample_schema, partial_data) +class TestEndToEndValidation: + """End-to-end tests for complete object validation against schema.""" - # Verify validation passes with partial data - assert validated is not None - assert "id" in validated - assert "title" in validated - assert "rating" not in validated - assert "location" not in validated - assert "embedding" not in validated - - def test_explicit_none_fields_are_excluded(self, sample_schema): - """Test that fields explicitly set to None are excluded from output.""" - # Object with some fields set to None - data_with_none = { - "id": "doc1", + @pytest.mark.parametrize( + "schema_type,data,expected_result", + [ + # Valid HASH data + ( + "hash", + { + "test_id": "doc1", + "title": "Test Document", + "rating": 4.5, + "location": "37.7749,-122.4194", + "embedding": b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + "int_vector": b"\x01\x02\x03", + }, + { + "success": True, + "fields": [ + "test_id", + "title", + "rating", + "location", + "embedding", + "int_vector", + ], + }, + ), + # Partial HASH data + ( + "hash", + {"test_id": "doc1", "title": "Test Document"}, + {"success": True, "fields": ["test_id", "title"]}, + ), + # Valid JSON data + ( + "json", + { + "test_id": "doc1", + "metadata": {"user": "user123", "rating": 4.5}, + "content": { + "title": "Test Document", + "embedding": [0.1, 0.2, 0.3, 0.4], + "int_vector": [1, 2, 3], + }, + }, + { + "success": True, + "fields": [ + "test_id", + "user", + "rating", + "title", + "embedding", + "int_vector", + ], + }, + ), + # Invalid HASH data - wrong vector type + ( + "hash", + { + "test_id": "doc1", + "embedding": [0.1, 0.2, 0.3, 0.4], # Should be bytes for HASH + }, + {"success": False, "error_field": "embedding"}, + ), + # Invalid JSON data - wrong vector type + ( + "json", + { + "test_id": "doc1", + "content": { + "embedding": b"\x00\x00\x00\x00" # Should be list for JSON + }, + }, + {"success": False, "error_field": "embedding"}, + ), + ], + ) + def test_end_to_end_validation( + self, sample_hash_schema, sample_json_schema, schema_type, data, expected_result + ): + """Test validating complete objects with various data scenarios.""" + # Select schema based on type + schema = sample_hash_schema if schema_type == "hash" else sample_json_schema + + if expected_result["success"]: + # Validation should succeed + validated = validate_object(schema, data) + + # Verify expected fields are present + for field in expected_result["fields"]: + assert field in validated + else: + # Validation should fail + with pytest.raises(ValueError) as exc_info: + validate_object(schema, data) + + # Error should mention the field + assert expected_result["error_field"] in str(exc_info.value) + + +# -------------------- ADDITIONAL TESTS -------------------- + + +class TestEdgeCases: + """Tests for edge cases and boundary conditions.""" + + def test_empty_object_validation(self, sample_hash_schema, sample_json_schema): + """Test validating an empty object.""" 
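+        # A sketch of why the asserts below should hold (an assumption to verify,
+        # per the TODO: the generated pydantic model marks every schema field as
+        # Optional with a None default, and validate_object() returns
+        # model_validate(obj).model_dump(exclude_none=True)):
+        #
+        #   model = SchemaModelGenerator.get_model_for_schema(sample_hash_schema)
+        #   assert model.model_validate({}).model_dump(exclude_none=True) == {}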
+ # Empty object should validate for both storage types (all fields are optional) + # TODO confirm if this is indeed true + assert validate_object(sample_hash_schema, {}) == {} + assert validate_object(sample_json_schema, {}) == {} + + def test_additional_fields(self, sample_hash_schema, valid_hash_data): + """Test that additional fields not in schema are NOT ignored.""" + # Add extra field not in schema + data_with_extra = valid_hash_data.copy() + data_with_extra["extra_field"] = "some value" + + # Validation should succeed and preserve the extra field + validated = validate_object(sample_hash_schema, data_with_extra) + assert "extra_field" in validated + + def test_explicit_none_fields_excluded(self, sample_hash_schema): + """Test that fields explicitly set to None are excluded.""" + # Data with explicit None values + data = { + "test_id": "doc1", "title": "Test Document", "rating": None, "location": None, } - # Validate object - validated = validate_object(sample_schema, data_with_none) + # Validate and check fields + validated = validate_object(sample_hash_schema, data) + assert "test_id" in validated assert "title" in validated assert "rating" not in validated assert "location" not in validated - - def test_validate_with_multiple_invalid_fields(self, sample_schema, valid_data): - """Test validation with multiple invalid fields.""" - # Create object with multiple invalid fields - invalid_data = valid_data.copy() - invalid_data["title"] = 123 - invalid_data["rating"] = "not a number" - invalid_data["location"] = "invalid" - - # Validation should fail with the first error encountered - with pytest.raises(ValueError) as exc_info: - validate_object(sample_schema, invalid_data) - - # Error message should mention validation failure - assert "Validation failed" in str(exc_info.value) - - @pytest.mark.parametrize( - "case", - [ - {"field": "title", "value": 123, "error_text": "must be a string"}, - {"field": "rating", "value": "high", "error_text": "must be a number"}, - { - "field": "location", - "value": "invalid_geo", - "error_text": "not a valid 'lat,lon' format", - }, - { - "field": "embedding", - "value": [0.1, 0.2, 0.3], - "error_text": "dimensions", - }, - ], - ) - def test_validate_invalid_field_parametrized(self, sample_schema, valid_data, case): - """Parametrized test for validating invalid fields.""" - # Create invalid data according to test case - invalid_data = valid_data.copy() - invalid_data[case["field"]] = case["value"] - - # Validate and check error - with pytest.raises(ValueError) as exc_info: - validate_object(sample_schema, invalid_data) - - # Error should mention the field and specific issue - error_message = str(exc_info.value) - assert case["field"] in error_message - assert case["error_text"] in error_message From fdd70a047ae13c60a5ce4627cfd9f541d35d16ea Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Wed, 26 Mar 2025 11:05:53 -0400 Subject: [PATCH 03/11] update tests, docs, and formatting/linting --- docs/user_guide/01_getting_started.ipynb | 236 ++-- docs/user_guide/data_validation.ipynb | 1102 ------------------ redisvl/index/index.py | 32 +- redisvl/index/storage.py | 10 +- redisvl/schema/validation.py | 19 +- tests/integration/test_async_search_index.py | 10 +- tests/integration/test_search_index.py | 6 +- tests/unit/test_edge_cases.py | 451 ------- tests/unit/test_fields.py | 55 - tests/unit/test_storage.py | 24 +- 10 files changed, 185 insertions(+), 1760
deletions(-) delete mode 100644 docs/user_guide/data_validation.ipynb delete mode 100644 tests/unit/test_edge_cases.py diff --git a/docs/user_guide/01_getting_started.ipynb b/docs/user_guide/01_getting_started.ipynb index 6130f589..7ab3a234 100644 --- a/docs/user_guide/01_getting_started.ipynb +++ b/docs/user_guide/01_getting_started.ipynb @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -126,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -173,29 +173,7 @@ "source": [ "## Create a `SearchIndex`\n", "\n", - "With the schema and sample dataset ready, instantiate a `SearchIndex`:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.index import SearchIndex\n", - "\n", - "index = SearchIndex.from_dict(schema)\n", - "# or use .from_yaml('schema_file.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we also need to facilitate a Redis connection. There are a few ways to do this:\n", - "\n", - "- Create & manage your own client connection (recommended)\n", - "- Provide a Redis URL and let RedisVL connect on your behalf (by default, it will connect to \"redis://localhost:6379\")" + "With the schema and sample dataset ready, create a `SearchIndex`." ] }, { @@ -209,31 +187,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "from redisvl.index import SearchIndex\n", "from redis import Redis\n", "\n", "client = Redis.from_url(\"redis://localhost:6379\")\n", - "index = SearchIndex.from_dict(schema, redis_client=client)\n", - "\n", - "# alternatively, provide an async Redis client object to enable async index operations\n", - "# from redis.asyncio import Redis\n", - "# from redisvl.index import AsyncSearchIndex\n", - "# client = Redis.from_url(\"redis://localhost:6379\")\n", - "# index = AsyncSearchIndex.from_dict(schema, redis_client=client)\n" + "index = SearchIndex.from_dict(schema, redis_client=client, validate_on_load=True)" ] }, { @@ -262,24 +224,24 @@ } ], "source": [ - "index = SearchIndex.from_dict(schema, redis_url=\"redis://localhost:6379\")\n", + "index = SearchIndex.from_dict(schema, redis_url=\"redis://localhost:6379\", validate_on_load=True)\n", "\n", "# If you don't specify a client or Redis URL, the index will attempt to\n", - "# connect to Redis at the default address (\"redis://localhost:6379\")." + "# connect to Redis at the default address \"redis://localhost:6379\"." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Create the underlying index\n", + "### Create the index\n", "\n", "Now that we are connected to Redis, we need to run the create command." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -303,15 +265,15 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32m11:50:15\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m11:50:15\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
user_simple\n" + "\u001b[32m10:59:25\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", + "\u001b[32m10:59:25\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. user_simple\n" ] } ], @@ -321,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -359,19 +321,22 @@ "source": [ "## Load Data to `SearchIndex`\n", "\n", - "Load the sample dataset to Redis:" + "Load the sample dataset to Redis.\n", + "\n", + "### Validate data entries on load\n", + "RedisVL uses pydantic validation under the hood to ensure loaded data is valid and confirms to your schema. This setting is optional and can be configured in the `SearchIndex` class." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['user_simple_docs:01JM2NWFWNH0BNA640MT5DS8BD', 'user_simple_docs:01JM2NWFWNF4S2V4E4HYG25CVA', 'user_simple_docs:01JM2NWFWNBFXJJ4PV9F4KMJSE']\n" + "['user_simple_docs:01JQ9FEZ4GAAYT9W7BWAF7CV18', 'user_simple_docs:01JQ9FEZ4JCE5FD1D5QY6BAJ0J', 'user_simple_docs:01JQ9FEZ4KF9AZYBKMYNMYBZ5A']\n" ] } ], @@ -388,6 +353,96 @@ ">By default, `load` will create a unique Redis key as a combination of the index key `prefix` and a random ULID. You can also customize the key by providing direct keys or pointing to a specified `id_field` on load." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load invalid data\n", + "This will raise a `SchemaValidationError` if `validate_on_load` is set to true in the `SearchIndex` class." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "11:00:03 redisvl.index.index ERROR Schema validation error while loading data\n", + "Traceback (most recent call last):\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 204, in _preprocess_and_validate_objects\n", + " processed_obj = self._validate(processed_obj)\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 160, in _validate\n", + " return validate_object(self.index_schema, obj)\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py\", line 274, in validate_object\n", + " validated = model_class.model_validate(flat_obj)\n", + " File \"/Users/tyler.hutcherson/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py\", line 627, in model_validate\n", + " return cls.__pydantic_validator__.validate_python(\n", + " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^\n", + " obj, strict=strict, from_attributes=from_attributes, context=context\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + "pydantic_core._pydantic_core.ValidationError: 1 validation error for user_simple__PydanticModel\n", + "user_embedding\n", + " Input should be a valid bytes [type=bytes_type, input_value=True, input_type=bool]\n", + " For further information visit https://errors.pydantic.dev/2.10/v/bytes_type\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py\", line 586, in load\n", + " return 
self._storage.write(\n", + " ~~~~~~~~~~~~~~~~~~~^\n", + " self._redis_client, # type: ignore\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " ...<6 lines>...\n", + " validate=self._validate_on_load,\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " )\n", + " ^\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 265, in write\n", + " prepared_objects = self._preprocess_and_validate_objects(\n", + " list(objects), # Convert Iterable to List\n", + " ...<3 lines>...\n", + " validate=validate,\n", + " )\n", + " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 211, in _preprocess_and_validate_objects\n", + " raise SchemaValidationError(str(e), index=i) from e\n", + "redisvl.exceptions.SchemaValidationError: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\n", + "user_embedding\n", + " Input should be a valid bytes [type=bytes_type, input_value=True, input_type=bool]\n", + " For further information visit https://errors.pydantic.dev/2.10/v/bytes_type\n" + ] + }, + { + "ename": "SchemaValidationError", + "evalue": "Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid bytes [type=bytes_type, input_value=True, input_type=bool]\n For further information visit https://errors.pydantic.dev/2.10/v/bytes_type", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:204\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate:\n\u001b[0;32m--> 204\u001b[0m processed_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocessed_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;66;03m# Store valid object with its key for writing\u001b[39;00m\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:160\u001b[0m, in \u001b[0;36mBaseStorage._validate\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;66;03m# Pass directly to validation function and let any errors propagate\u001b[39;00m\n\u001b[0;32m--> 160\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvalidate_object\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex_schema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py:274\u001b[0m, in \u001b[0;36mvalidate_object\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;66;03m# Validate against model\u001b[39;00m\n\u001b[0;32m--> 274\u001b[0m validated \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 275\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
validated\u001b[38;5;241m.\u001b[39mmodel_dump(exclude_none\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", + "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py:627\u001b[0m, in \u001b[0;36mBaseModel.model_validate\u001b[0;34m(cls, obj, strict, from_attributes, context)\u001b[0m\n\u001b[1;32m 626\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 627\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrom_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfrom_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid bytes [type=bytes_type, input_value=True, input_type=bool]\n For further information visit https://errors.pydantic.dev/2.10/v/bytes_type", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mSchemaValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m keys \u001b[38;5;241m=\u001b[39m \u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43muser_embedding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m}\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py:586\u001b[0m, in \u001b[0;36mSearchIndex.load\u001b[0;34m(self, data, id_field, keys, ttl, preprocess, batch_size)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load objects to the Redis database. 
Returns the list of keys loaded\u001b[39;00m\n\u001b[1;32m 557\u001b[0m \u001b[38;5;124;03mto Redis.\u001b[39;00m\n\u001b[1;32m 558\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 583\u001b[0m \u001b[38;5;124;03m RedisVLError: If there's an error loading data to Redis.\u001b[39;00m\n\u001b[1;32m 584\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 585\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 586\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_storage\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 587\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_redis_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 588\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjects\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 589\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 590\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[43m \u001b[49m\u001b[43mttl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mttl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 592\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 593\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 594\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_on_load\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 595\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 596\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m SchemaValidationError:\n\u001b[1;32m 597\u001b[0m \u001b[38;5;66;03m# Pass through validation errors directly\u001b[39;00m\n\u001b[1;32m 598\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSchema validation error while loading data\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:265\u001b[0m, in \u001b[0;36mBaseStorage.write\u001b[0;34m(self, redis_client, objects, id_field, keys, ttl, preprocess, batch_size, validate)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m []\n\u001b[1;32m 264\u001b[0m \u001b[38;5;66;03m# Pass 1: Preprocess and validate all objects\u001b[39;00m\n\u001b[0;32m--> 265\u001b[0m prepared_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_preprocess_and_validate_objects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 266\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mobjects\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Convert Iterable to List\u001b[39;49;00m\n\u001b[1;32m 267\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;66;03m# Pass 2: Write all valid objects in batches\u001b[39;00m\n\u001b[1;32m 274\u001b[0m added_keys \u001b[38;5;241m=\u001b[39m []\n", + "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:211\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 207\u001b[0m prepared_objects\u001b[38;5;241m.\u001b[39mappend((key, processed_obj))\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 210\u001b[0m \u001b[38;5;66;03m# Convert Pydantic ValidationError to SchemaValidationError with index context\u001b[39;00m\n\u001b[0;32m--> 211\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m SchemaValidationError(\u001b[38;5;28mstr\u001b[39m(e), index\u001b[38;5;241m=\u001b[39mi) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 212\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 213\u001b[0m \u001b[38;5;66;03m# Capture other exceptions with context\u001b[39;00m\n\u001b[1;32m 214\u001b[0m object_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mat index \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mSchemaValidationError\u001b[0m: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid bytes [type=bytes_type, input_value=True, input_type=bool]\n For further information visit https://errors.pydantic.dev/2.10/v/bytes_type" + ] + } + ], + "source": [ + "keys = index.load([{\"user_embedding\": True}])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -398,14 +453,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['user_simple_docs:01JM2NWJGYMJ0QTR5YB4MB0BX9']\n" + "['user_simple_docs:01JQ9FHCB1B64GXF6WPK127VZ6']\n" ] } ], @@ -435,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -460,20 +515,13 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "*=>[KNN 3 @user_embedding $vector AS vector_distance] RETURN 6 user age job credit_score vector_distance vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3\n" - ] - }, { "data": { "text/html": [ - 
"table>vector_distanceuseragejobcredit_score0john1engineerhigh0mary2doctorlow0.0566299557686tyler9engineerhigh" + "
vector_distanceuseragejobcredit_score
0john1engineerhigh
0mary2doctorlow
0.0566299557686tyler9engineerhigh
" ], "text/plain": [ "" @@ -500,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -519,7 +567,7 @@ " 'datatype': 'float32'}}]}" ] }, - "execution_count": 13, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -530,32 +578,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from redisvl.index import AsyncSearchIndex\n", "from redis.asyncio import Redis\n", "\n", "client = Redis.from_url(\"redis://localhost:6379\")\n", - "\n", "index = AsyncSearchIndex.from_dict(schema, redis_client=client)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -596,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -621,14 +657,14 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "11:28:32 redisvl.index.index INFO Index already exists, overwriting.\n" + "11:01:30 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -639,13 +675,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr>
<tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr>
<tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr>
<tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" + "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr>
<tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr>
<tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr>
<tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" ], "text/plain": [ "" @@ -671,7 +707,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -684,9 +720,9 @@ "│ Stat Key │ Value │\n", "├─────────────────────────────┼─────────────┤\n", "│ num_docs │ 4 │\n", - "│ num_terms │ 4 │\n", + "│ num_terms │ 0 │\n", "│ max_doc_id │ 4 │\n", - "│ num_records │ 22 │\n", + "│ num_records │ 20 │\n", "│ percent_indexed │ 1 │\n", "│ hash_indexing_failures │ 0 │\n", "│ number_of_uses │ 2 │\n", @@ -699,9 +735,9 @@ "│ offsets_per_term_avg │ 0 │\n", "│ records_per_doc_avg │ 5 │\n", "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.239 │\n", + "│ total_indexing_time │ 6.529 │\n", "│ total_inverted_index_blocks │ 11 │\n", - "│ vector_index_sz_mb │ 0.235603 │\n", + "│ vector_index_sz_mb │ 0.235947 │\n", "╰─────────────────────────────┴─────────────╯\n" ] } @@ -730,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -739,7 +775,7 @@ "4" ] }, - "execution_count": 21, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -751,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -760,7 +796,7 @@ "True" ] }, - "execution_count": 22, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -772,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/user_guide/data_validation.ipynb b/docs/user_guide/data_validation.ipynb deleted file mode 100644 index 366f47a4..00000000 --- a/docs/user_guide/data_validation.ipynb +++ /dev/null @@ -1,1102 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Getting Started with RedisVL\n", - "`redisvl` is a versatile Python library with an integrated CLI, designed to enhance AI applications using Redis. This guide will walk you through the following steps:\n", - "\n", - "1. Defining an `IndexSchema`\n", - "2. Preparing a sample dataset\n", - "3. Creating a `SearchIndex` object\n", - "4. Testing `rvl` CLI functionality\n", - "5. Loading the sample data\n", - "6. Building `VectorQuery` objects and executing searches\n", - "7. Updating a `SearchIndex` object\n", - "\n", - "...and more!\n", - "\n", - "Prerequisites:\n", - "- Ensure `redisvl` is installed in your Python environment.\n", - "- Have a running instance of [Redis Stack](https://redis.io/docs/install/install-stack/) or [Redis Cloud](https://redis.io/cloud).\n", - "\n", - "_____" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define an `IndexSchema`\n", - "\n", - "The `IndexSchema` maintains crucial **index configuration** and **field definitions** to\n", - "enable search with Redis. For ease of use, the schema can be constructed from a\n", - "python dictionary or yaml file.\n", - "\n", - "### Example Schema Creation\n", - "Consider a dataset with user information, including `job`, `age`, `credit_score`,\n", - "and a 3-dimensional `user_embedding` vector.\n", - "\n", - "You must also decide on a Redis index name and key prefix to use for this\n", - "dataset. 
Below are example schema definitions in both YAML and Dict format.\n", - "\n", - "**YAML Definition:**\n", - "\n", - "```yaml\n", - "version: '0.1.0'\n", - "\n", - "index:\n", - " name: user_simple\n", - " prefix: user_simple_docs\n", - "\n", - "fields:\n", - " - name: user\n", - " type: tag\n", - " - name: credit_score\n", - " type: tag\n", - " - name: job\n", - " type: text\n", - " - name: age\n", - " type: numeric\n", - " - name: user_embedding\n", - " type: vector\n", - " attrs:\n", - " algorithm: flat\n", - " dims: 3\n", - " distance_metric: cosine\n", - " datatype: float32\n", - "```\n", - "> Store this in a local file, such as `schema.yaml`, for RedisVL usage." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Python Dictionary:**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "schema = {\n", - " \"index\": {\n", - " \"name\": \"user_simple\",\n", - " \"prefix\": \"user_simple_docs\",\n", - " \"storage_type\": \"json\"\n", - " },\n", - " \"fields\": [\n", - " {\"name\": \"user\", \"type\": \"tag\"},\n", - " {\"name\": \"credit_score\", \"type\": \"tag\"},\n", - " {\"name\": \"job\", \"type\": \"text\"},\n", - " {\"name\": \"age\", \"type\": \"numeric\"},\n", - " {\"name\": \"location\", \"type\": \"geo\"},\n", - " {\n", - " \"name\": \"user_embedding\",\n", - " \"type\": \"vector\",\n", - " \"attrs\": {\n", - " \"dims\": 3,\n", - " \"distance_metric\": \"cosine\",\n", - " \"algorithm\": \"flat\",\n", - " \"datatype\": \"float32\"\n", - " }\n", - " }\n", - " ]\n", - "}" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sample Dataset Preparation\n", - "\n", - "Below, create a mock dataset with `user`, `job`, `age`, `credit_score`, and\n", - "`user_embedding` fields. The `user_embedding` vectors are synthetic examples\n", - "for demonstration purposes.\n", - "\n", - "For more information on creating real-world embeddings, refer to this\n", - "[article](https://mlops.community/vector-similarity-search-from-basics-to-production/)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "\n", - "data = [\n", - " {\n", - " 'user': 'john',\n", - " 'age': 1,\n", - " 'job': 'engineer',\n", - " 'credit_score': 'high',\n", - " 'location': '37.540760,-77.433929',\n", - " 'user_embedding': np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes()\n", - " },\n", - " {\n", - " 'user': 'mary',\n", - " 'age': 2,\n", - " 'job': 'doctor',\n", - " 'credit_score': 'low',\n", - " 'location': '37.540760,-77.433929',\n", - " 'user_embedding': np.array([0.1, 0.1, 0.5], dtype=np.float32).tobytes()\n", - " },\n", - " {\n", - " 'user': 'joe',\n", - " 'age': 3,\n", - " 'job': 'dentist',\n", - " 'credit_score': 'medium',\n", - " 'location': '37.540760,-77.433929',\n", - " 'user_embedding': np.array([0.9, 0.9, 0.1], dtype=np.float32).tobytes()\n", - " }\n", - "]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">As seen above, the sample `user_embedding` vectors are converted into bytes. Using the `NumPy`, this is fairly trivial." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a `SearchIndex`\n", - "\n", - "With the schema and sample dataset ready, instantiate a `SearchIndex`:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.index import SearchIndex\n", - "\n", - "index = SearchIndex.from_dict(schema)\n", - "# or use .from_yaml('schema_file.yaml')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we also need to facilitate a Redis connection. There are a few ways to do this:\n", - "\n", - "- Create & manage your own client connection (recommended)\n", - "- Provide a Redis URL and let RedisVL connect on your behalf (by default, it will connect to \"redis://localhost:6379\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bring your own Redis connection instance\n", - "\n", - "This is ideal in scenarios where you have custom settings on the connection instance or if your application will share a connection pool:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from redis import Redis\n", - "\n", - "client = Redis.from_url(\"redis://localhost:6379\")\n", - "index = SearchIndex.from_dict(schema, redis_client=client)\n", - "\n", - "# alternatively, provide an async Redis client object to enable async index operations\n", - "# from redis.asyncio import Redis\n", - "# from redisvl.index import AsyncSearchIndex\n", - "# client = Redis.from_url(\"redis://localhost:6379\")\n", - "# index = AsyncSearchIndex.from_dict(schema, redis_client=client)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let the index manage the connection instance\n", - "\n", - "This is ideal for simple cases:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "index = SearchIndex.from_dict(schema, redis_url=\"redis://localhost:6379\", validate_on_load=True)\n", - "\n", - "# If you don't specify a client or Redis URL, the index will attempt to\n", - "# connect to Redis at the default address (\"redis://localhost:6379\")." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create the underlying index\n", - "\n", - "Now that we are connected to Redis, we need to run the create command." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16:42:16 redisvl.index.index INFO Index already exists, overwriting.\n" - ] - } - ], - "source": [ - "index.create(overwrite=True, drop=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">Note that at this point, the index has no entries. Data loading follows." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inspect with the `rvl` CLI\n", - "Use the `rvl` CLI to inspect the created index and its fields:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[32m16:36:30\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m Indices:\n", - "\u001b[32m16:36:30\u001b[0m \u001b[34m[RedisVL]\u001b[0m \u001b[1;30mINFO\u001b[0m 1. 
user_simple\n" - ] - } - ], - "source": [ - "!rvl index listall" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "Index Information:\n", - "╭──────────────┬────────────────┬──────────────────────┬─────────────────┬────────────╮\n", - "│ Index Name │ Storage Type │ Prefixes │ Index Options │ Indexing │\n", - "├──────────────┼────────────────┼──────────────────────┼─────────────────┼────────────┤\n", - "│ user_simple │ JSON │ ['user_simple_docs'] │ [] │ 0 │\n", - "╰──────────────┴────────────────┴──────────────────────┴─────────────────┴────────────╯\n", - "Index Fields:\n", - "╭──────────────────┬────────────────┬─────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬────────────────┬─────────────────┬────────────────╮\n", - "│ Name │ Attribute │ Type │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │ Field Option │ Option Value │\n", - "├──────────────────┼────────────────┼─────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼────────────────┼─────────────────┼────────────────┤\n", - "│ $.user │ user │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │\n", - "│ $.credit_score │ credit_score │ TAG │ SEPARATOR │ , │ │ │ │ │ │ │\n", - "│ $.job │ job │ TEXT │ WEIGHT │ 1 │ │ │ │ │ │ │\n", - "│ $.age │ age │ NUMERIC │ │ │ │ │ │ │ │ │\n", - "│ $.location │ location │ GEO │ │ │ │ │ │ │ │ │\n", - "│ $.user_embedding │ user_embedding │ VECTOR │ algorithm │ FLAT │ data_type │ FLOAT32 │ dim │ 3 │ distance_metric │ COSINE │\n", - "╰──────────────────┴────────────────┴─────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴────────────────┴─────────────────┴────────────────╯\n" - ] - } - ], - "source": [ - "!rvl index info -i user_simple" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Data to `SearchIndex`\n", - "\n", - "Load the sample dataset to Redis:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "16:42:27 redisvl.index.index ERROR Error while loading data to Redis\n", - "Traceback (most recent call last):\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 201, in _preprocess_and_validate_objects\n", - " processed_obj = self.validate(processed_obj)\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 158, in validate\n", - " return validate_object(self.index_schema, obj)\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py\", line 254, in validate_object\n", - " validated = model_class.model_validate(flat_obj)\n", - " File \"/Users/tyler.hutcherson/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py\", line 627, in model_validate\n", - " return cls.__pydantic_validator__.validate_python(\n", - " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^\n", - " obj, strict=strict, from_attributes=from_attributes, context=context\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " )\n", - " ^\n", - "pydantic_core._pydantic_core.ValidationError: 1 validation error for user_simple__PydanticModel\n", - "user_embedding\n", - " Input should be a valid list 
[type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n", - " For further information visit https://errors.pydantic.dev/2.10/v/list_type\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py\", line 600, in load\n", - " return self._storage.write(\n", - " ~~~~~~~~~~~~~~~~~~~^\n", - " self._redis_client, # type: ignore\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " ...<6 lines>...\n", - " validate=self._validate_on_load,\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " )\n", - " ^\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 260, in write\n", - " prepared_objects = self._preprocess_and_validate_objects(\n", - " objects,\n", - " ...<3 lines>...\n", - " validate=validate\n", - " )\n", - " File \"/Users/tyler.hutcherson/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py\", line 212, in _preprocess_and_validate_objects\n", - " raise ValueError(f\"Validation failed for object {object_id}: {str(e)}\")\n", - "ValueError: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\n", - "user_embedding\n", - " Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n", - " For further information visit https://errors.pydantic.dev/2.10/v/list_type\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:201\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m validate:\n\u001b[0;32m--> 201\u001b[0m processed_obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprocessed_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 203\u001b[0m \u001b[38;5;66;03m# Store valid object with its key for writing\u001b[39;00m\n", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:158\u001b[0m, in \u001b[0;36mBaseStorage.validate\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# Pass directly to validation function and let any errors propagate\u001b[39;00m\n\u001b[0;32m--> 158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mvalidate_object\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex_schema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/schema/validation.py:254\u001b[0m, in 
\u001b[0;36mvalidate_object\u001b[0;34m(schema, obj)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[38;5;66;03m# Validate against model\u001b[39;00m\n\u001b[0;32m--> 254\u001b[0m validated \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mflat_obj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m validated\u001b[38;5;241m.\u001b[39mmodel_dump(exclude_none\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", - "File \u001b[0;32m~/Library/Caches/pypoetry/virtualenvs/redisvl-VnTEShF2-py3.13/lib/python3.13/site-packages/pydantic/main.py:627\u001b[0m, in \u001b[0;36mBaseModel.model_validate\u001b[0;34m(cls, obj, strict, from_attributes, context)\u001b[0m\n\u001b[1;32m 626\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 627\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstrict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstrict\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfrom_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfrom_attributes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mValidationError\u001b[0m: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m keys \u001b[38;5;241m=\u001b[39m \u001b[43mindex\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(keys)\n", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/index.py:600\u001b[0m, in \u001b[0;36mSearchIndex.load\u001b[0;34m(self, data, id_field, keys, ttl, preprocess, batch_size)\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Load objects to the Redis database. 
Returns the list of keys loaded\u001b[39;00m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;124;03mto Redis.\u001b[39;00m\n\u001b[1;32m 553\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124;03m keys = index.load(data, preprocess=add_field)\u001b[39;00m\n\u001b[1;32m 598\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_storage\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 601\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_redis_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore\u001b[39;49;00m\n\u001b[1;32m 602\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjects\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 603\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 604\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 605\u001b[0m \u001b[43m \u001b[49m\u001b[43mttl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mttl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 606\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 607\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbatch_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 608\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_on_load\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 609\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 611\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError while loading data to Redis\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:260\u001b[0m, in \u001b[0;36mBaseStorage.write\u001b[0;34m(self, redis_client, objects, id_field, keys, ttl, preprocess, batch_size, validate)\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m []\n\u001b[1;32m 259\u001b[0m \u001b[38;5;66;03m# Pass 1: Preprocess and validate all objects\u001b[39;00m\n\u001b[0;32m--> 260\u001b[0m prepared_objects \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_preprocess_and_validate_objects\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 261\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjects\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 262\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_field\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mid_field\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 263\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 264\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpreprocess\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreprocess\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 265\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalidate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mvalidate\u001b[49m\n\u001b[1;32m 266\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[38;5;66;03m# Pass 2: Write all valid objects in batches\u001b[39;00m\n\u001b[1;32m 269\u001b[0m added_keys \u001b[38;5;241m=\u001b[39m []\n", - "File \u001b[0;32m~/Documents/AppliedAI/redis-vl-python/redisvl/index/storage.py:212\u001b[0m, in \u001b[0;36mBaseStorage._preprocess_and_validate_objects\u001b[0;34m(self, objects, id_field, keys, preprocess, validate)\u001b[0m\n\u001b[1;32m 209\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m id_field \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m id_field \u001b[38;5;129;01min\u001b[39;00m obj:\n\u001b[1;32m 210\u001b[0m object_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwith \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mid_field\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mobj[id_field]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mValidation failed for object \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mobject_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mstr\u001b[39m(e)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m prepared_objects\n", - "\u001b[0;31mValueError\u001b[0m: Validation failed for object at index 0: 1 validation error for user_simple__PydanticModel\nuser_embedding\n Input should be a valid list [type=list_type, input_value=b'\\xcd\\xcc\\xcc=\\xcd\\xcc\\xcc=\\x00\\x00\\x00?', input_type=bytes]\n For further information visit https://errors.pydantic.dev/2.10/v/list_type" - ] - } - ], - "source": [ - "keys = index.load(data)\n", - "\n", - "print(keys)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "IndexInfo(name='user_simple', prefix='user_simple_docs', key_separator=':', storage_type=)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.schema.index" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.load(data=[\n", - " {\n", - " 'user': 'john',\n", - " 'age': 1,\n", - " 'job': 'engineer',\n", - " 'credit_score': 'high',\n", - " 'location': 1,\n", - " 'user_embedding': [\n", - " ]\n", - " }\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'index_name': 'user_simple',\n", - " 'index_options': [],\n", - " 'index_definition': ['key_type',\n", - " 'JSON',\n", - " 'prefixes',\n", - " ['user_simple_docs'],\n", - " 'default_score',\n", - " '1',\n", - " 'indexes_all',\n", - " 
'false'],\n", - " 'attributes': [['identifier',\n", - " '$.user',\n", - " 'attribute',\n", - " 'user',\n", - " 'type',\n", - " 'TAG',\n", - " 'SEPARATOR',\n", - " ','],\n", - " ['identifier',\n", - " '$.credit_score',\n", - " 'attribute',\n", - " 'credit_score',\n", - " 'type',\n", - " 'TAG',\n", - " 'SEPARATOR',\n", - " ','],\n", - " ['identifier', '$.job', 'attribute', 'job', 'type', 'TEXT', 'WEIGHT', '1'],\n", - " ['identifier', '$.age', 'attribute', 'age', 'type', 'NUMERIC'],\n", - " ['identifier', '$.location', 'attribute', 'location', 'type', 'GEO'],\n", - " ['identifier',\n", - " '$.user_embedding',\n", - " 'attribute',\n", - " 'user_embedding',\n", - " 'type',\n", - " 'VECTOR',\n", - " 'algorithm',\n", - " 'FLAT',\n", - " 'data_type',\n", - " 'FLOAT32',\n", - " 'dim',\n", - " 3,\n", - " 'distance_metric',\n", - " 'COSINE']],\n", - " 'num_docs': 2,\n", - " 'max_doc_id': 2,\n", - " 'num_terms': 2,\n", - " 'num_records': 12,\n", - " 'inverted_sz_mb': '4.61578369140625e-4',\n", - " 'vector_index_sz_mb': '0.028045654296875',\n", - " 'total_inverted_index_blocks': 5,\n", - " 'offset_vectors_sz_mb': '3.814697265625e-6',\n", - " 'doc_table_size_mb': '2.117156982421875e-4',\n", - " 'sortable_values_size_mb': '0',\n", - " 'key_table_size_mb': '8.296966552734375e-5',\n", - " 'tag_overhead_sz_mb': '5.53131103515625e-5',\n", - " 'text_overhead_sz_mb': '6.67572021484375e-5',\n", - " 'total_index_memory_sz_mb': '9.565353393554688e-4',\n", - " 'geoshapes_sz_mb': '0',\n", - " 'records_per_doc_avg': '6',\n", - " 'bytes_per_record_avg': '40.33333206176758',\n", - " 'offsets_per_term_avg': '0.3333333432674408',\n", - " 'offset_bits_per_record_avg': '8',\n", - " 'hash_indexing_failures': 4,\n", - " 'total_indexing_time': '0.3160000145435333',\n", - " 'indexing': 0,\n", - " 'percent_indexed': '1',\n", - " 'number_of_uses': 2,\n", - " 'cleaning': 0,\n", - " 'gc_stats': ['bytes_collected',\n", - " '0',\n", - " 'total_ms_run',\n", - " '0',\n", - " 'total_cycles',\n", - " '0',\n", - " 'average_cycle_time_ms',\n", - " 'nan',\n", - " 'last_run_time_ms',\n", - " '0',\n", - " 'gc_numeric_trees_missed',\n", - " '0',\n", - " 'gc_blocks_denied',\n", - " '0'],\n", - " 'cursor_stats': ['global_idle',\n", - " 0,\n", - " 'global_total',\n", - " 0,\n", - " 'index_capacity',\n", - " 128,\n", - " 'index_total',\n", - " 0],\n", - " 'dialect_stats': ['dialect_1',\n", - " 0,\n", - " 'dialect_2',\n", - " 0,\n", - " 'dialect_3',\n", - " 0,\n", - " 'dialect_4',\n", - " 0],\n", - " 'Index Errors': ['indexing failures',\n", - " 4,\n", - " 'last indexing error',\n", - " 'Empty array for vector field on JSON document',\n", - " 'last indexing error key',\n", - " 'user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE'],\n", - " 'field statistics': [['identifier',\n", - " '$.user',\n", - " 'attribute',\n", - " 'user',\n", - " 'Index Errors',\n", - " ['indexing failures',\n", - " 0,\n", - " 'last indexing error',\n", - " 'N/A',\n", - " 'last indexing error key',\n", - " 'N/A']],\n", - " ['identifier',\n", - " '$.credit_score',\n", - " 'attribute',\n", - " 'credit_score',\n", - " 'Index Errors',\n", - " ['indexing failures',\n", - " 0,\n", - " 'last indexing error',\n", - " 'N/A',\n", - " 'last indexing error key',\n", - " 'N/A']],\n", - " ['identifier',\n", - " '$.job',\n", - " 'attribute',\n", - " 'job',\n", - " 'Index Errors',\n", - " ['indexing failures',\n", - " 0,\n", - " 'last indexing error',\n", - " 'N/A',\n", - " 'last indexing error key',\n", - " 'N/A']],\n", - " ['identifier',\n", - " '$.age',\n", - " 'attribute',\n", - " 'age',\n", 
- " 'Index Errors',\n", - " ['indexing failures',\n", - " 0,\n", - " 'last indexing error',\n", - " 'N/A',\n", - " 'last indexing error key',\n", - " 'N/A']],\n", - " ['identifier',\n", - " '$.location',\n", - " 'attribute',\n", - " 'location',\n", - " 'Index Errors',\n", - " ['indexing failures',\n", - " 0,\n", - " 'last indexing error',\n", - " 'N/A',\n", - " 'last indexing error key',\n", - " 'N/A']],\n", - " ['identifier',\n", - " '$.user_embedding',\n", - " 'attribute',\n", - " 'user_embedding',\n", - " 'Index Errors',\n", - " ['indexing failures',\n", - " 4,\n", - " 'last indexing error',\n", - " 'Empty array for vector field on JSON document',\n", - " 'last indexing error key',\n", - " 'user_simple_docs:01JQ4Y9V0NK7QBYKMCP47MT3DE']]]}" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">By default, `load` will create a unique Redis key as a combination of the index key `prefix` and a random ULID. You can also customize the key by providing direct keys or pointing to a specified `id_field` on load." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Upsert the index with new data\n", - "Upsert data by using the `load` method again:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['user_simple_docs:01JM2NWJGYMJ0QTR5YB4MB0BX9']\n" - ] - } - ], - "source": [ - "# Add more data\n", - "new_data = [{\n", - " 'user': 'tyler',\n", - " 'age': 9,\n", - " 'job': 'engineer',\n", - " 'credit_score': 'high',\n", - " 'user_embedding': np.array([0.1, 0.3, 0.5], dtype=np.float32).tobytes()\n", - "}]\n", - "keys = index.load(new_data)\n", - "\n", - "print(keys)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating `VectorQuery` Objects\n", - "\n", - "Next we will create a vector query object for our newly populated index. This example will use a simple vector to demonstrate how vector similarity works. Vectors in production will likely be much larger than 3 floats and often require Machine Learning models (i.e. Huggingface sentence transformers) or an embeddings API (Cohere, OpenAI). `redisvl` provides a set of [Vectorizers](https://docs.redisvl.com/en/latest/user_guide/vectorizers_04.html#openai) to assist in vector creation." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "from redisvl.query import VectorQuery\n", - "from jupyterutils import result_print\n", - "\n", - "query = VectorQuery(\n", - " vector=[0.1, 0.1, 0.5],\n", - " vector_field_name=\"user_embedding\",\n", - " return_fields=[\"user\", \"age\", \"job\", \"credit_score\", \"vector_distance\"],\n", - " num_results=3\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Executing queries\n", - "With our `VectorQuery` object defined above, we can execute the query over the `SearchIndex` using the `query` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "*=>[KNN 3 @user_embedding $vector AS vector_distance] RETURN 6 user age job credit_score vector_distance vector_distance SORTBY vector_distance ASC DIALECT 2 LIMIT 0 3\n" - ] - }, - { - "data": { - "text/html": [ - "table>vector_distanceuseragejobcredit_score0john1engineerhigh0mary2doctorlow0.0566299557686tyler9engineerhigh" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "results = index.query(query)\n", - "result_print(results)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using an Asynchronous Redis Client\n", - "\n", - "The `AsyncSearchIndex` class along with an async Redis python client allows for queries, index creation, and data loading to be done asynchronously. This is the\n", - "recommended route for working with `redisvl` in production-like settings." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'index': {'name': 'user_simple', 'prefix': 'user_simple_docs'},\n", - " 'fields': [{'name': 'user', 'type': 'tag'},\n", - " {'name': 'credit_score', 'type': 'tag'},\n", - " {'name': 'job', 'type': 'text'},\n", - " {'name': 'age', 'type': 'numeric'},\n", - " {'name': 'user_embedding',\n", - " 'type': 'vector',\n", - " 'attrs': {'dims': 3,\n", - " 'distance_metric': 'cosine',\n", - " 'algorithm': 'flat',\n", - " 'datatype': 'float32'}}]}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "schema" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from redisvl.index import AsyncSearchIndex\n", - "from redis.asyncio import Redis\n", - "\n", - "client = Redis.from_url(\"redis://localhost:6379\")\n", - "\n", - "index = AsyncSearchIndex.from_dict(schema, redis_client=client)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr>
<tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr>
<tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr>
<tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# execute the vector query async\n", - "results = await index.query(query)\n", - "result_print(results)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Updating a schema\n", - "In some scenarios, it makes sense to update the index schema. With Redis and `redisvl`, this is easy because Redis can keep the underlying data in place while you change or make updates to the index configuration." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So for our scenario, let's imagine we want to reindex this data in 2 ways:\n", - "- by using a `Tag` type for `job` field instead of `Text`\n", - "- by using an `hnsw` vector index for the `user_embedding` field instead of a `flat` vector index" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Modify this schema to have what we want\n", - "\n", - "index.schema.remove_field(\"job\")\n", - "index.schema.remove_field(\"user_embedding\")\n", - "index.schema.add_fields([\n", - " {\"name\": \"job\", \"type\": \"tag\"},\n", - " {\n", - " \"name\": \"user_embedding\",\n", - " \"type\": \"vector\",\n", - " \"attrs\": {\n", - " \"dims\": 3,\n", - " \"distance_metric\": \"cosine\",\n", - " \"algorithm\": \"hnsw\",\n", - " \"datatype\": \"float32\"\n", - " }\n", - " }\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "11:28:32 redisvl.index.index INFO Index already exists, overwriting.\n" - ] - } - ], - "source": [ - "# Run the index update but keep underlying data in place\n", - "await index.create(overwrite=True, drop=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
<table><tr><th>vector_distance</th><th>user</th><th>age</th><th>job</th><th>credit_score</th></tr>
<tr><td>0</td><td>mary</td><td>2</td><td>doctor</td><td>low</td></tr>
<tr><td>0</td><td>john</td><td>1</td><td>engineer</td><td>high</td></tr>
<tr><td>0.0566299557686</td><td>tyler</td><td>9</td><td>engineer</td><td>high</td></tr></table>
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Execute the vector query async\n", - "results = await index.query(query)\n", - "result_print(results)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Check Index Stats\n", - "Use the `rvl` CLI to check the stats for the index:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Statistics:\n", - "╭─────────────────────────────┬─────────────╮\n", - "│ Stat Key │ Value │\n", - "├─────────────────────────────┼─────────────┤\n", - "│ num_docs │ 4 │\n", - "│ num_terms │ 4 │\n", - "│ max_doc_id │ 4 │\n", - "│ num_records │ 22 │\n", - "│ percent_indexed │ 1 │\n", - "│ hash_indexing_failures │ 0 │\n", - "│ number_of_uses │ 2 │\n", - "│ bytes_per_record_avg │ 47.8 │\n", - "│ doc_table_size_mb │ 0.000423431 │\n", - "│ inverted_sz_mb │ 0.000911713 │\n", - "│ key_table_size_mb │ 0.000165939 │\n", - "│ offset_bits_per_record_avg │ nan │\n", - "│ offset_vectors_sz_mb │ 0 │\n", - "│ offsets_per_term_avg │ 0 │\n", - "│ records_per_doc_avg │ 5 │\n", - "│ sortable_values_size_mb │ 0 │\n", - "│ total_indexing_time │ 0.239 │\n", - "│ total_inverted_index_blocks │ 11 │\n", - "│ vector_index_sz_mb │ 0.235603 │\n", - "╰─────────────────────────────┴─────────────╯\n" - ] - } - ], - "source": [ - "!rvl stats -i user_simple" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleanup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below we will clean up after our work. First, you can flush all data from Redis associated with the index by\n", - "using the `.clear()` method. This will leave the secondary index in place for future insertions or updates.\n", - "\n", - "But if you want to clean up everything, including the index, just use `.delete()`\n", - "which will by default remove the index AND the underlying data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Clear all data from Redis associated with the index\n", - "await index.clear()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Butm the index is still in place\n", - "await index.exists()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# Remove / delete the index in its entirety\n", - "await index.delete()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "env", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.2" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/redisvl/index/index.py b/redisvl/index/index.py index 806b4ba5..4ec473a4 100644 --- a/redisvl/index/index.py +++ b/redisvl/index/index.py @@ -175,7 +175,7 @@ def from_yaml(cls, schema_path: str, **kwargs): from redisvl.index import SearchIndex - index = SearchIndex.from_yaml("schemas/schema.yaml") + index = SearchIndex.from_yaml("schemas/schema.yaml", redis_url="redis://localhost:6379") """ schema = IndexSchema.from_yaml(schema_path) return cls(schema=schema, **kwargs) @@ -203,7 +203,7 @@ def from_dict(cls, schema_dict: Dict[str, Any], **kwargs): "fields": [ {"name": "doc-id", "type": "tag"} ] - }) + }, redis_url="redis://localhost:6379") """ schema = IndexSchema.from_dict(schema_dict) @@ -247,10 +247,14 @@ class SearchIndex(BaseSearchIndex): from redisvl.index import SearchIndex # initialize the index object with schema from file - index = SearchIndex.from_yaml("schemas/schema.yaml", redis_url="redis://localhost:6379") + index = SearchIndex.from_yaml( + "schemas/schema.yaml", + redis_url="redis://localhost:6379", + validate_on_load=True + ) # create the index - index.create(overwrite=True) + index.create(overwrite=True, drop=False) # data is an iterable of dictionaries index.load(data) @@ -407,11 +411,6 @@ def connect(self, redis_url: Optional[str] = None, **kwargs): ValueError: If the Redis URL is not provided nor accessible through the `REDIS_URL` environment variable. ModuleNotFoundError: If required Redis modules are not installed. - - .. code-block:: python - - index.connect(redis_url="redis://localhost:6379") - """ self.__redis_client = RedisConnectionFactory.get_redis_connection( redis_url=redis_url, **kwargs @@ -431,16 +430,6 @@ def set_client(self, redis_client: redis.Redis, **kwargs): Raises: TypeError: If the provided client is not valid. - - .. 
code-block:: python - - import redis - from redisvl.index import SearchIndex - - client = redis.Redis.from_url("redis://localhost:6379") - index = SearchIndex.from_yaml("schemas/schema.yaml") - index.set_client(client) - """ RedisConnectionFactory.validate_sync_redis(redis_client) self.__redis_client = redis_client @@ -906,11 +895,12 @@ class AsyncSearchIndex(BaseSearchIndex): # initialize the index object with schema from file index = AsyncSearchIndex.from_yaml( "schemas/schema.yaml", - redis_url="redis://localhost:6379" + redis_url="redis://localhost:6379", + validate_on_load=True ) # create the index - await index.create(overwrite=True) + await index.create(overwrite=True, drop=False) # data is an iterable of dictionaries await index.load(data) diff --git a/redisvl/index/storage.py b/redisvl/index/storage.py index f0af1e5b..792b6bc4 100644 --- a/redisvl/index/storage.py +++ b/redisvl/index/storage.py @@ -143,7 +143,7 @@ async def _aget(client: AsyncRedis, key: str) -> Dict[str, Any]: """ raise NotImplementedError - def validate(self, obj: Dict[str, Any]) -> Dict[str, Any]: + def _validate(self, obj: Dict[str, Any]) -> Dict[str, Any]: """ Validate an object against the schema using Pydantic-based validation. @@ -161,7 +161,7 @@ def validate(self, obj: Dict[str, Any]) -> Dict[str, Any]: def _preprocess_and_validate_objects( self, - objects: List[Any], + objects: Iterable[Any], id_field: Optional[str] = None, keys: Optional[Iterable[str]] = None, preprocess: Optional[Callable] = None, @@ -201,7 +201,7 @@ def _preprocess_and_validate_objects( # Schema validation if enabled if validate: - processed_obj = self.validate(processed_obj) + processed_obj = self._validate(processed_obj) # Store valid object with its key for writing prepared_objects.append((key, processed_obj)) @@ -263,7 +263,7 @@ def write( # Pass 1: Preprocess and validate all objects prepared_objects = self._preprocess_and_validate_objects( - objects, + list(objects), # Convert Iterable to List id_field=id_field, keys=keys, preprocess=preprocess, @@ -342,7 +342,7 @@ async def awrite( # Pass 1: Preprocess and validate all objects prepared_objects = self._preprocess_and_validate_objects( - objects, + list(objects), # Convert Iterable to List id_field=id_field, keys=keys, preprocess=preprocess, diff --git a/redisvl/schema/validation.py b/redisvl/schema/validation.py index b102166c..c4ddd3e7 100644 --- a/redisvl/schema/validation.py +++ b/redisvl/schema/validation.py @@ -8,7 +8,7 @@ import json import re import warnings -from typing import Any, Dict, List, Optional, Type, Union +from typing import Any, Dict, List, Optional, Type, Union, cast from pydantic import BaseModel, Field, field_validator @@ -53,7 +53,7 @@ def get_model_for_schema(cls, schema: IndexSchema) -> Type[BaseModel]: @classmethod def _map_field_to_pydantic_type( cls, field: BaseField, storage_type: StorageType - ) -> Type: + ) -> Type[Any]: """ Map Redis field types to appropriate Pydantic types. 
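# --- Illustrative sketch (not part of this patch) ----------------------------
# The validation.py hunks around here build a Pydantic model dynamically from
# an IndexSchema so that objects can be checked before being written to Redis.
# Below is a minimal standalone version of that idea using pydantic.create_model
# directly; the toy field map and model name are hypothetical illustrations,
# not the generated user_simple__PydanticModel.
from typing import Optional, Union

from pydantic import ValidationError, create_model

# Map a toy schema (field name -> Redis field type) onto Python annotations,
# mirroring _map_field_to_pydantic_type: text/tag/geo -> str, numeric -> int|float.
toy_fields = {"job": "text", "age": "numeric", "location": "geo"}
type_map = {"text": str, "tag": str, "numeric": Union[int, float], "geo": str}

# Every field is optional with default None, since loaded objects may be partial.
ToyModel = create_model(
    "ToyValidationModel",
    **{name: (Optional[type_map[ftype]], None) for name, ftype in toy_fields.items()},
)

try:
    ToyModel.model_validate({"job": "engineer", "age": "not-a-number"})
except ValidationError as e:
    print(e)  # pydantic reports that 'age' is not a valid integer or number
# ------------------------------------------------------------------------------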
@@ -72,14 +72,17 @@ def _map_field_to_pydantic_type( elif field.type == FieldTypes.TAG: return str elif field.type == FieldTypes.NUMERIC: - return Union[int, float] + return Union[int, float] # type: ignore elif field.type == FieldTypes.GEO: return str elif field.type == FieldTypes.VECTOR: # For JSON storage, vectors are always lists if storage_type == StorageType.JSON: # For int data types, vectors must be ints, otherwise floats - if field.attrs.datatype in (VectorDataType.INT8, VectorDataType.UINT8): + if field.attrs.datatype in ( # type: ignore + VectorDataType.INT8, + VectorDataType.UINT8, + ): return List[int] return List[float] else: @@ -103,8 +106,8 @@ def _create_model(cls, schema: IndexSchema) -> Type[BaseModel]: storage_type = schema.index.storage_type # Create annotations dictionary for the dynamic model - annotations = {} - class_dict = {} + annotations: Dict[str, Any] = {} + class_dict: Dict[str, Any] = {} # Build annotations and field metadata for field_name, field in schema.fields.items(): @@ -154,6 +157,8 @@ def _disallow_bool(cls, value): # Register validators for VECTOR fields elif field.type == FieldTypes.VECTOR: + dims = field.attrs.dims # type: ignore + datatype = field.attrs.datatype # type: ignore def make_vector_validator( fname: str, dims: int, datatype: VectorDataType @@ -190,7 +195,7 @@ def _validate_vector(cls, value): return _validate_vector class_dict[f"validate_{field_name}"] = make_vector_validator( - field_name, field.attrs.dims, field.attrs.datatype + field_name, dims, datatype ) # Create class dictionary with annotations and field metadata diff --git a/tests/integration/test_async_search_index.py b/tests/integration/test_async_search_index.py index ea122d5d..32a2e3d3 100644 --- a/tests/integration/test_async_search_index.py +++ b/tests/integration/test_async_search_index.py @@ -5,7 +5,7 @@ from redis import Redis as SyncRedis from redis.asyncio import Redis as AsyncRedis -from redisvl.exceptions import RedisModuleVersionError, RedisSearchError +from redisvl.exceptions import RedisModuleVersionError, RedisSearchError, RedisVLError from redisvl.index import AsyncSearchIndex from redisvl.query import VectorQuery from redisvl.query.query import FilterQuery @@ -269,7 +269,7 @@ async def test_search_index_load_preprocess(async_index): await async_index.create(overwrite=True, drop=True) data = [{"id": "1", "test": "foo"}] - async def preprocess(record): + def preprocess(record): record["test"] = "bar" return record @@ -281,10 +281,10 @@ async def preprocess(record): == "bar" ) - async def bad_preprocess(record): + def bad_preprocess(record): return 1 - with pytest.raises(ValueError): + with pytest.raises(RedisVLError): await async_index.load(data, id_field="id", preprocess=bad_preprocess) @@ -300,7 +300,7 @@ async def test_no_id_field(async_index): bad_data = [{"wrong_key": "1", "value": "test"}] # catch missing / invalid id_field - with pytest.raises(ValueError): + with pytest.raises(RedisVLError): await async_index.load(bad_data, id_field="key") diff --git a/tests/integration/test_search_index.py b/tests/integration/test_search_index.py index 02b6d5e4..875449be 100644 --- a/tests/integration/test_search_index.py +++ b/tests/integration/test_search_index.py @@ -4,7 +4,7 @@ import pytest from redis import Redis -from redisvl.exceptions import RedisModuleVersionError, RedisSearchError +from redisvl.exceptions import RedisModuleVersionError, RedisSearchError, RedisVLError from redisvl.index import SearchIndex from redisvl.query import VectorQuery from 
redisvl.query.query import FilterQuery @@ -268,7 +268,7 @@ def preprocess(record): def bad_preprocess(record): return 1 - with pytest.raises(ValueError): + with pytest.raises(RedisVLError): index.load(data, id_field="id", preprocess=bad_preprocess) @@ -277,7 +277,7 @@ def test_no_id_field(index): bad_data = [{"wrong_key": "1", "value": "test"}] # catch missing / invalid id_field - with pytest.raises(ValueError): + with pytest.raises(RedisVLError): index.load(bad_data, id_field="key") diff --git a/tests/unit/test_edge_cases.py b/tests/unit/test_edge_cases.py deleted file mode 100644 index 3646cc1f..00000000 --- a/tests/unit/test_edge_cases.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -Tests for edge cases in the RedisVL validation module. - -This module tests edge cases in the validation system that might not be -covered in the main test files, including: -1. Performance and caching behavior -2. Handling of unusual data types -3. Extreme values -4. Boundary conditions -""" - -import time -from typing import Any, Dict, List - -import pytest - -from redisvl.index.storage import BaseStorage -from redisvl.schema.fields import Field, FieldTypes, VectorDataType -from redisvl.schema.index import Index, IndexSchema -from redisvl.schema.validation import SchemaModelGenerator, validate_object - - -class TestSchemaModelCaching: - """Tests for model caching behavior.""" - - def test_caching_improves_performance(self): - """Test that caching improves model generation performance.""" - # Create a complex schema - fields = { - f"field_{i}": Field(name=f"field_{i}", type=FieldTypes.TEXT) - for i in range(50) # 50 fields should be enough to measure performance - } - - schema = IndexSchema( - index=Index(name="performance_test", prefix="doc"), fields=fields - ) - - # First generation (not cached) - start_time = time.time() - model1 = SchemaModelGenerator.get_model_for_schema(schema) - first_time = time.time() - start_time - - # Second generation (should be cached) - start_time = time.time() - model2 = SchemaModelGenerator.get_model_for_schema(schema) - second_time = time.time() - start_time - - # Verify second generation is faster - assert second_time < first_time - - # Should be much faster (usually at least 10x) - assert second_time < (first_time * 0.5) - - # Verify same model instance - assert model1 is model2 - - def test_different_schemas_get_different_models(self): - """Test that different schemas get different model instances.""" - # Create two different schemas - schema1 = IndexSchema( - index=Index(name="test1", prefix="doc1"), - fields={"field1": Field(name="field1", type=FieldTypes.TEXT)}, - ) - - schema2 = IndexSchema( - index=Index(name="test2", prefix="doc2"), - fields={"field1": Field(name="field1", type=FieldTypes.TEXT)}, - ) - - # Get models - model1 = SchemaModelGenerator.get_model_for_schema(schema1) - model2 = SchemaModelGenerator.get_model_for_schema(schema2) - - # Verify different model instances - assert model1 is not model2 - assert model1.__name__ != model2.__name__ - - -class TestUnusualDataTypes: - """Tests for handling unusual data types during validation.""" - - @pytest.fixture - def basic_schema(self): - """Create a basic schema for testing.""" - return IndexSchema( - index=Index(name="test", prefix="doc"), - fields={ - "text_field": Field(name="text_field", type=FieldTypes.TEXT), - "tag_field": Field(name="tag_field", type=FieldTypes.TAG), - "num_field": Field(name="num_field", type=FieldTypes.NUMERIC), - }, - ) - - def test_none_values(self, basic_schema): - """Test handling of 
None values."""
-        # Data with None values
-        data = {"text_field": None, "tag_field": None, "num_field": None}
-
-        # Validate
-        result = validate_object(basic_schema, data)
-
-        # None values should be excluded
-        assert len(result) == 0
-
-    def test_empty_string_values(self, basic_schema):
-        """Test handling of empty strings."""
-        # Data with empty strings
-        data = {"text_field": "", "tag_field": "", "num_field": 0}
-
-        # Validate
-        result = validate_object(basic_schema, data)
-
-        # Empty strings are valid for text and tag
-        assert result["text_field"] == ""
-        assert result["tag_field"] == ""
-        assert result["num_field"] == 0
-
-    def test_boolean_values(self, basic_schema):
-        """Test handling of boolean values."""
-        # Data with booleans
-        data = {"text_field": True, "tag_field": False, "num_field": True}
-
-        # Booleans aren't valid for text or tag
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(basic_schema, data)
-
-        assert "text_field" in str(exc_info.value)
-
-        # Create new schema with only numeric
-        num_schema = IndexSchema(
-            index=Index(name="test", prefix="doc"),
-            fields={"num_field": Field(name="num_field", type=FieldTypes.NUMERIC)},
-        )
-
-        # Validate with only the numeric field
-        result = validate_object(num_schema, {"num_field": True})
-
-        # Python converts True to 1, False to 0
-        assert result["num_field"] == 1
-
-    def test_list_for_text(self, basic_schema):
-        """Test handling lists for text fields."""
-        # Data with list for text
-        data = {"text_field": ["item1", "item2"]}
-
-        # Lists aren't valid for text
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(basic_schema, data)
-
-        assert "text_field" in str(exc_info.value)
-
-
-class TestVectorEdgeCases:
-    """Tests for edge cases with vector fields."""
-
-    @pytest.fixture
-    def vector_schema(self):
-        """Create a schema with vector fields for testing."""
-        return IndexSchema(
-            index=Index(name="test_vectors", prefix="vec"),
-            fields={
-                "float_vec": Field(
-                    name="float_vec",
-                    type=FieldTypes.VECTOR,
-                    attrs={"dims": 3, "datatype": VectorDataType.FLOAT32},
-                ),
-                "int_vec": Field(
-                    name="int_vec",
-                    type=FieldTypes.VECTOR,
-                    attrs={"dims": 3, "datatype": VectorDataType.INT8},
-                ),
-            },
-        )
-
-    def test_large_vectors(self, vector_schema):
-        """Test validation of very large vectors."""
-        # Create a large vector (1000 dimensions)
-        large_schema = IndexSchema(
-            index=Index(name="large_vec", prefix="vec"),
-            fields={
-                "large_vec": Field(
-                    name="large_vec",
-                    type=FieldTypes.VECTOR,
-                    attrs={"dims": 1000, "datatype": VectorDataType.FLOAT32},
-                )
-            },
-        )
-
-        # Valid large vector
-        large_vector = {"large_vec": [0.1] * 1000}
-        result = validate_object(large_schema, large_vector)
-        assert len(result["large_vec"]) == 1000
-
-        # Invalid dimensions
-        invalid_dims = {"large_vec": [0.1] * 999}
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(large_schema, invalid_dims)
-        assert "dimensions" in str(exc_info.value)
-
-    def test_mixed_vector_types(self, vector_schema):
-        """Test validation of vectors with mixed element types."""
-        # Float vector with mixed types
-        mixed_float = {"float_vec": [1, 2.5, "3"]}
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(vector_schema, mixed_float)
-        assert "float_vec" in str(exc_info.value)
-
-        # Int vector with mixed types
-        mixed_int = {"int_vec": [1, 2.5, 3]}
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(vector_schema, mixed_int)
-        assert "int_vec" in str(exc_info.value)
-
-    def test_empty_vector(self, vector_schema):
-        """Test validation of empty vectors."""
-        # Empty float vector
-        empty_vec = {"float_vec": []}
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(vector_schema, empty_vec)
-        assert "float_vec" in str(exc_info.value)
-        assert "dimensions" in str(exc_info.value)
-
-    def test_vector_int_range(self, vector_schema):
-        """Test validation of integer vectors with values outside allowed range."""
-        # INT8 vector with values outside range
-        out_of_range = {"int_vec": [100, 200, 300]}  # Valid int, but outside INT8 range
-        with pytest.raises(ValueError) as exc_info:
-            validate_object(vector_schema, out_of_range)
-        assert "int_vec" in str(exc_info.value)
-        assert "must be between" in str(exc_info.value)
-
-        # INT8 vector with valid range
-        valid_range = {"int_vec": [-128, 0, 127]}
-        result = validate_object(vector_schema, valid_range)
-        assert result["int_vec"] == [-128, 0, 127]
-
-
-class TestGeoEdgeCases:
-    """Tests for edge cases with geo fields."""
-
-    @pytest.fixture
-    def geo_schema(self):
-        """Create a schema with geo fields for testing."""
-        return IndexSchema(
-            index=Index(name="test_geo", prefix="geo"),
-            fields={"location": Field(name="location", type=FieldTypes.GEO)},
-        )
-
-    def test_geo_boundary_values(self, geo_schema):
-        """Test validation of geo fields with boundary values."""
-        # Valid boundary values
-        valid_boundaries = [
-            {"location": "90,180"},  # Max lat, max lon
-            {"location": "-90,-180"},  # Min lat, min lon
-            {"location": "0,0"},  # Zero point
-            {"location": "90,0"},  # North pole
-            {"location": "-90,0"},  # South pole
-        ]
-
-        for data in valid_boundaries:
-            result = validate_object(geo_schema, data)
-            assert result["location"] == data["location"]
-
-    def test_geo_invalid_boundary_values(self, geo_schema):
-        """Test validation of geo fields with invalid boundary values."""
-        # Invalid boundary values
-        invalid_boundaries = [
-            {"location": "91,0"},  # Lat > 90
-            {"location": "-91,0"},  # Lat < -90
-            {"location": "0,181"},  # Lon > 180
-            {"location": "0,-181"},  # Lon < -180
-            {"location": "90.1,0"},  # Lat > 90 (decimal)
-            {"location": "0,180.1"},  # Lon > 180 (decimal)
-        ]
-
-        for data in invalid_boundaries:
-            with pytest.raises(ValueError) as exc_info:
-                validate_object(geo_schema, data)
-            assert "location" in str(exc_info.value)
-            assert "not a valid" in str(exc_info.value)
-
-    def test_geo_formats(self, geo_schema):
-        """Test validation of geo fields with different formats."""
-        # Various valid formats
-        valid_formats = [
-            {"location": "37.7749,-122.4194"},  # Decimal degrees
-            {"location": "-37.7749,122.4194"},  # Negative latitude
-            {"location": "37.7749,122.4194"},  # Positive longitude
-            {"location": "0.0000,0.0000"},  # Zeros with decimal
-            {"location": "37,-122"},  # Integer degrees
-        ]
-
-        for data in valid_formats:
-            result = validate_object(geo_schema, data)
-            assert result["location"] == data["location"]
-
-        # Invalid formats
-        invalid_formats = [
-            {"location": "37.7749"},  # Missing longitude
-            {"location": "37.7749,"},  # Missing longitude value
-            {"location": ",122.4194"},  # Missing latitude value
-            {"location": "37.7749:122.4194"},  # Wrong separator
-            {"location": "37.7749, 122.4194"},  # Space after separator
-            {"location": "North,South"},  # Non-numeric values
-        ]
-
-        for data in invalid_formats:
-            with pytest.raises(ValueError) as exc_info:
-                validate_object(geo_schema, data)
-            assert "location" in str(exc_info.value)
-
-
-class TestNestedJsonEdgeCases:
-    """Tests for edge cases with nested JSON."""
-
-    @pytest.fixture
-    def nested_schema(self):
-        """Create a schema with JSON paths for testing."""
-        fields = {
-            "id": Field(name="id", type=FieldTypes.TAG),
-            "title": Field(name="title", type=FieldTypes.TEXT, path="$.content.title"),
-            "rating": Field(
-                name="rating", type=FieldTypes.NUMERIC, path="$.metadata.rating"
-            ),
-            "deeply_nested": Field(
-                name="deeply_nested",
-                type=FieldTypes.TEXT,
-                path="$.level1.level2.level3.level4.value",
-            ),
-        }
-
-        return IndexSchema(
-            index=Index(name="test_nested", prefix="nested"), fields=fields
-        )
-
-    def test_very_deeply_nested_json(self, nested_schema):
-        """Test validation with very deeply nested JSON."""
-        # Create a deeply nested structure
-        deeply_nested = {
-            "id": "doc1",
-            "level1": {
-                "level2": {"level3": {"level4": {"value": "deeply nested value"}}}
-            },
-        }
-
-        # Validate
-        result = validate_object(nested_schema, deeply_nested)
-        assert result["id"] == "doc1"
-        assert result["deeply_nested"] == "deeply nested value"
-
-    def test_partial_path_missing(self, nested_schema):
-        """Test validation when part of a JSON path is missing."""
-        # Create object with partial path missing
-        partial_missing = {
-            "id": "doc1",
-            "level1": {
-                "level2": {
-                    # level3 missing
-                }
-            },
-        }
-
-        # Validate - should ignore missing path
-        result = validate_object(nested_schema, partial_missing)
-        assert result["id"] == "doc1"
-        assert "deeply_nested" not in result
-
-    def test_nested_arrays(self):
-        """Test validation with nested arrays in JSON."""
-        # Create schema with path to array element
-        array_schema = IndexSchema(
-            index=Index(name="test_arrays", prefix="arr"),
-            fields={
-                "id": Field(name="id", type=FieldTypes.TAG),
-                "first_item": Field(
-                    name="first_item", type=FieldTypes.TEXT, path="$.items[0]"
-                ),
-                "nested_item": Field(
-                    name="nested_item",
-                    type=FieldTypes.TEXT,
-                    path="$.nested.items[1].name",
-                ),
-            },
-        )
-
-        # Note: JSONPath with array indexing is not supported currently
-        # This test documents this limitation
-
-        # Create data with arrays
-        array_data = {
-            "id": "arr1",
-            "items": ["first", "second", "third"],
-            "nested": {"items": [{"name": "item1"}, {"name": "item2"}]},
-        }
-
-        # Validate - array paths won't be found
-        result = validate_object(array_schema, array_data)
-        assert result["id"] == "arr1"
-        assert "first_item" not in result
-        assert "nested_item" not in result
-
-
-class TestValidationIntegrationEdgeCases:
-    """Tests for integration edge cases between storage and validation."""
-
-    @pytest.fixture
-    def storage_with_schema(self):
-        """Create a storage instance with schema for testing."""
-        schema = IndexSchema(
-            index=Index(name="test_storage", prefix="doc"),
-            fields={
-                "id": Field(name="id", type=FieldTypes.TAG),
-                "vec": Field(
-                    name="vec",
-                    type=FieldTypes.VECTOR,
-                    attrs={"dims": 3, "datatype": VectorDataType.FLOAT32},
-                ),
-            },
-        )
-
-        return BaseStorage(schema=schema, client=None)
-
-    def test_validation_with_bytes_no_client(self, storage_with_schema):
-        """Test validation with bytes when no Redis client is available."""
-        # No Redis client was provided, so hset won't be called
-        # This just tests that validation works with bytes
-
-        # Valid data with bytes
-        data = {"id": "doc1", "vec": b"\x00\x01\x02"}  # 3 bytes
-
-        # Validate - should work even without client
-        validated = storage_with_schema.validate_object(data)
-        assert validated["id"] == "doc1"
-        assert validated["vec"] == b"\x00\x01\x02"
-
-    def test_unexpected_field_is_ignored(self, storage_with_schema):
-        """Test that unexpected fields are ignored during validation."""
-        # Data with extra field
-        data = {
-            "id": "doc1",
-            "vec": [0.1, 0.2, 0.3],
-            "extra": "This field is not in the schema",
-        }
-
-        # Validate
-        validated = storage_with_schema.validate_object(data)
-
-        # Extra field should be ignored
-        assert validated["id"] == "doc1"
-        assert validated["vec"] == [0.1, 0.2, 0.3]
-        assert "extra" not in validated
diff --git a/tests/unit/test_fields.py b/tests/unit/test_fields.py
index 3376a67c..0c0d504e 100644
--- a/tests/unit/test_fields.py
+++ b/tests/unit/test_fields.py
@@ -219,58 +219,3 @@ def test_create_unknown_field_type():
     with pytest.raises(ValueError) as excinfo:
         FieldFactory.create_field("unknown", "example_field")
     assert "Unknown field type: unknown" in str(excinfo.value)
-
-
-# Add validation tests for each field type
-@pytest.mark.parametrize(
-    "field_class,valid_value,invalid_value,error_msg",
-    [
-        (TextField, "sample text", 123, "expects a string"),
-        (NumericField, 123.45, "123.45", "looks like a number"),
-        (TagField, ["tag1", "tag2"], ["tag1", 123], "must be a string"),
-        (GeoField, "37.7749,-122.4194", "invalid-geo", "not a valid 'lat,lon' format"),
-        # Add vector field test cases
-    ],
-)
-def test_field_validation(field_class, valid_value, invalid_value, error_msg):
-    """Test validation logic for each field type"""
-    # Create field instance
-    field = field_class(name="test_field")
-
-    # Test valid value
-    is_valid, error = field.validate(valid_value)
-    assert is_valid, f"Field should accept valid value: {valid_value}"
-    assert error is None, "No error message should be returned for valid value"
-
-    # Test invalid value
-    is_valid, error = field.validate(invalid_value)
-    assert not is_valid, f"Field should reject invalid value: {invalid_value}"
-    assert (
-        error_msg in error
-    ), f"Error message should contain '{error_msg}', got: {error}"
-
-
-def test_vector_field_validation():
-    """Test validation for vector fields specifically"""
-    # Create vector fields with specific dimensions
-    flat_field = create_flat_vector_field(dims=3)
-    hnsw_field = create_hnsw_vector_field(dims=3)
-
-    # Valid vector
-    valid_vector = [0.1, 0.2, 0.3]
-
-    # Test valid cases
-    assert flat_field.validate(valid_vector)[0], "Should accept valid vector"
-    assert hnsw_field.validate(valid_vector)[0], "Should accept valid vector"
-
-    # Test wrong dimensions
-    wrong_dims = [0.1, 0.2]  # Only 2 dimensions
-    is_valid, error = flat_field.validate(wrong_dims)
-    assert not is_valid, "Should reject vector with wrong dimensions"
-    assert "expects 3 dimensions" in error
-
-    # Test wrong type
-    wrong_type = ["a", "b", "c"]  # Strings instead of numbers
-    is_valid, error = hnsw_field.validate(wrong_type)
-    assert not is_valid, "Should reject vector with non-numeric elements"
-    assert "must be a number" in error
diff --git a/tests/unit/test_storage.py b/tests/unit/test_storage.py
index 4a34d340..11f51e73 100644
--- a/tests/unit/test_storage.py
+++ b/tests/unit/test_storage.py
@@ -19,6 +19,7 @@ def sample_hash_schema():
         "fields": [
             {"name": "test_id", "type": "tag"},
             {"name": "title", "type": "text"},
+            {"name": "user", "type": "tag"},
             {"name": "rating", "type": "numeric"},
             {"name": "location", "type": "geo"},
             {
@@ -57,14 +58,14 @@ def sample_json_schema():
             "storage_type": "json",
         },
         "fields": [
-            {"name": "test_id", "type": "tag", "path": "$.test_id"},
-            {"name": "user", "type": "tag", "path": "$.metadata.user"},
-            {"name": "title", "type": "text", "path": "$.content.title"},
-            {"name": "rating", "type": "numeric", "path": "$.metadata.rating"},
+            {"name": "test_id", "type": "tag"},
+            {"name": "user", "type": "tag"},
+            {"name": "title", "type": "text"},
+            {"name": "rating", "type": "numeric"},
+            {"name": "location", "type": "geo"},
             {
                 "name": "embedding",
                 "type": "vector",
-                "path": "$.content.embedding",
                 "attrs": {
                     "algorithm": "flat",
                     "dims": 4,
@@ -75,7 +76,6 @@ def sample_json_schema():
             {
                 "name": "int_vector",
                 "type": "vector",
-                "path": "$.content.int_vector",
                 "attrs": {
                     "algorithm": "flat",
                     "dims": 3,
@@ -120,7 +120,7 @@ def test_create_key(storage_instance):
 def test_validate_success(storage_instance):
     try:
-        storage_instance.validate(
+        storage_instance._validate(
             {"test_id": "1234", "rating": 5, "user": "john", "title": "engineer"}
         )
     except Exception as e:
@@ -130,10 +130,11 @@ def test_validate_failure(storage_instance):
     data = {"title": 5}
     with pytest.raises(ValidationError):
-        storage_instance.validate(data)
-    data = {"user": True}
+        storage_instance._validate(data)
+
+    data = {"user": [1]}
     with pytest.raises(ValidationError):
-        storage_instance.validate(data)
+        storage_instance._validate(data)
 
 def test_validate_preprocess_and_validate_failure(storage_instance):
@@ -143,7 +144,8 @@
     )
     with pytest.raises(SchemaValidationError):
         storage_instance._preprocess_and_validate_objects(objects=[data], validate=True)
-    data = {"user": True}
+
+    data = {"user": [1]}
     data == storage_instance._preprocess_and_validate_objects(
         objects=[data], validate=False
     )

From 7f56857a2efd4e0bdc9452f7a84b642c510cc8b3 Mon Sep 17 00:00:00 2001
From: Tyler Hutcherson
Date: Wed, 26 Mar 2025 11:07:26 -0400
Subject: [PATCH 04/11] Remove validation docs page

---
 docs/validation.md | 228 ---------------------------------------------
 1 file changed, 228 deletions(-)
 delete mode 100644 docs/validation.md

diff --git a/docs/validation.md b/docs/validation.md
deleted file mode 100644
index 204a009f..00000000
--- a/docs/validation.md
+++ /dev/null
@@ -1,228 +0,0 @@
-# RedisVL Validation System
-
-The RedisVL validation system ensures that data written to Redis indexes conforms to the defined schema. It uses dynamic Pydantic model generation to validate objects before they are stored.
-
-## Key Features
-
-- **Schema-Based Validation**: Validates objects against your index schema definition
-- **Dynamic Model Generation**: Creates Pydantic models on the fly based on your schema
-- **Type Checking**: Ensures fields contain appropriate data types
-- **Field-Specific Validation**:
-  - Text and Tag fields must be strings
-  - Numeric fields must be integers or floats
-  - Geo fields must be properly formatted latitude/longitude strings
-  - Vector fields must have the correct dimensions and data types
-- **JSON Path Support**: Validates fields extracted from nested JSON structures
-- **Fail-Fast Approach**: Stops processing at the first validation error
-- **Performance Optimized**: Caches models for repeated validation
-
-## Usage
-
-### Basic Validation
-
-```python
-from redisvl.schema.validation import validate_object
-
-# Assuming you have a schema defined
-validated_data = validate_object(schema, data)
-```
-
-### Storage Integration
-
-The validation is automatically integrated with the storage classes:
-
-```python
-from redisvl.index.storage import BaseStorage
-
-# Create storage with schema
-storage = BaseStorage(schema=schema, client=redis_client)
-
-# Write data - validation happens automatically
-storage.write_one(data)
-
-# Or validate explicitly
-validated = storage.validate_object(data)
-```
-
-## Field Type Validation
-
-The validation system supports all Redis field types:
-
-### Text Fields
-
-Text fields are validated to ensure they contain string values:
-
-```python
-# Valid
-{"title": "Hello World"}
-
-# Invalid
-{"title": 123}  # Not a string
-```
-
-### Tag Fields
-
-Tag fields are validated to ensure they contain string values:
-
-```python
-# Valid
-{"category": "electronics"}
-
-# Invalid
-{"category": 123}  # Not a string
-```
-
-### Numeric Fields
-
-Numeric fields must contain integers or floats:
-
-```python
-# Valid
-{"price": 19.99}
-{"quantity": 5}
-
-# Invalid
-{"price": "19.99"}  # String, not a number
-```
-
-### Geo Fields
-
-Geo fields must contain properly formatted latitude/longitude strings:
-
-```python
-# Valid
-{"location": "37.7749,-122.4194"}  # San Francisco
-{"location": "40.7128,-74.0060"}  # New York
-
-# Invalid
-{"location": "invalid"}  # Not in lat,lon format
-{"location": "91.0,0.0"}  # Latitude out of range (-90 to 90)
-{"location": "0.0,181.0"}  # Longitude out of range (-180 to 180)
-```
-
-### Vector Fields
-
-Vector fields must contain arrays with the correct dimensions and data types:
-
-```python
-# Valid
-{"embedding": [0.1, 0.2, 0.3, 0.4]}  # 4-dimensional float vector
-{"embedding": b'\x00\x01\x02\x03'}  # Raw bytes (dimensions not checked)
-
-# Invalid
-{"embedding": [0.1, 0.2, 0.3]}  # Wrong dimensions
-{"embedding": "not a vector"}  # Wrong type
-{"embedding": [0.1, "text", 0.3]}  # Mixed types
-```
-
-For integer vectors, the values must be within the appropriate range:
-
-- **INT8**: -128 to 127
-- **INT16**: -32,768 to 32,767
-
-```python
-# Valid INT8 vector
-{"int_vector": [1, 2, 3]}
-
-# Invalid INT8 vector
-{"int_vector": [1000, 2000, 3000]}  # Values out of range
-```
-
-## Nested JSON Validation
-
-The validation system supports extracting and validating fields from nested JSON structures:
-
-```python
-# Schema with JSON paths
-fields = {
-    "id": Field(name="id", type=FieldTypes.TAG),
-    "title": Field(name="title", type=FieldTypes.TEXT, path="$.content.title"),
-    "rating": Field(name="rating", type=FieldTypes.NUMERIC, path="$.metadata.rating")
-}
-
-# Nested JSON data
-data = {
-    "id": "doc1",
-    "content": {
-        "title": "Hello World"
-    },
-    "metadata": {
-        "rating": 4.5
-    }
-}
-
-# Validation extracts fields using JSON paths
-validated = validate_object(schema, data)
-# Result: {"id": "doc1", "title": "Hello World", "rating": 4.5}
-```
-
-## Error Handling
-
-The validation system uses a fail-fast approach, raising a `ValueError` when validation fails:
-
-```python
-try:
-    validated = validate_object(schema, data)
-except ValueError as e:
-    print(f"Validation error: {e}")
-    # Handle the error
-```
-
-The error message includes information about the field that failed validation.
-
-## Optional Fields
-
-All fields are considered optional during validation. If a field is missing, it will be excluded from the validated result:
-
-```python
-# Schema with multiple fields
-fields = {
-    "id": Field(name="id", type=FieldTypes.TAG),
-    "title": Field(name="title", type=FieldTypes.TEXT),
-    "rating": Field(name="rating", type=FieldTypes.NUMERIC)
-}
-
-# Data with missing fields
-data = {
-    "id": "doc1",
-    "title": "Hello World"
-    # rating is missing
-}
-
-# Validation succeeds with partial data
-validated = validate_object(schema, data)
-# Result: {"id": "doc1", "title": "Hello World"}
-```
-
-## Performance Considerations
-
-The validation system is optimized for performance:
-
-- **Model Caching**: Pydantic models are cached by schema name to avoid regeneration
-- **Lazy Validation**: Fields are validated only when needed
-- **Fail-Fast Approach**: Processing stops at the first validation error
-
-For large datasets, validation can be a significant part of the processing time. If you need to write many objects with the same structure, consider validating a sample first to ensure correctness.
-
-## Limitations
-
-- **JSON Path**: The current implementation only supports simple dot notation paths (e.g., `$.field.subfield`). Array indexing is not supported.
-- **Vector Bytes**: When vectors are provided as bytes, the dimensions cannot be validated.
-- **Custom Validators**: The current implementation does not support custom user-defined validators.
-
-## Best Practices
-
-1. **Define Clear Schemas**: Be explicit about field types and constraints
-2. **Pre-validate Critical Data**: For large datasets, validate a sample before processing everything
-3. **Handle Validation Errors**: Implement proper error handling for validation failures
-4. **Use JSON Paths Carefully**: Test nested JSON extraction to ensure paths are correctly defined
-5. **Consider Optional Fields**: Decide which fields are truly required for your application
-
-## Integration with Storage Classes
-
-The validation system is fully integrated with the storage classes:
-
-- **BaseStorage**: For hash-based storage, validates each field individually
-- **JsonStorage**: For JSON storage, extracts and validates fields from nested structures
-
-Each storage class automatically validates data before writing to Redis, ensuring data integrity.
\ No newline at end of file From 1d14f5f2e62c71ff13097433af57464b9253e0de Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Wed, 26 Mar 2025 11:24:22 -0400 Subject: [PATCH 05/11] skip cell in notebook testing --- docs/user_guide/01_getting_started.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/user_guide/01_getting_started.ipynb b/docs/user_guide/01_getting_started.ipynb index 7ab3a234..bf097415 100644 --- a/docs/user_guide/01_getting_started.ipynb +++ b/docs/user_guide/01_getting_started.ipynb @@ -440,6 +440,8 @@ } ], "source": [ + "# NBVAL_SKIP\n", + "\n", "keys = index.load([{\"user_embedding\": True}])" ] }, From f2f5010524f39ed9f72793e0c12dc72e25aaacfa Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 27 Mar 2025 15:25:45 -0400 Subject: [PATCH 06/11] update json path parser --- poetry.lock | 562 ++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + redisvl/schema/validation.py | 41 +-- 3 files changed, 570 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index 85c682e7..ed0e1652 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. [[package]] name = "accessible-pygments" @@ -6,6 +6,8 @@ version = "0.0.5" description = "A collection of accessible pygments styles" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7"}, {file = "accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872"}, @@ -24,6 +26,8 @@ version = "2.4.6" description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"}, {file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"}, @@ -35,6 +39,8 @@ version = "3.11.13" description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a4fe27dbbeec445e6e1291e61d61eb212ee9fed6e47998b27de71d70d3e8777d"}, {file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e64ca2dbea28807f8484c13f684a2f761e69ba2640ec49dacd342763cc265ef"}, @@ -138,6 +144,8 @@ version = "1.2.1" description = "asyncio rate limiter, a leaky bucket implementation" optional = true python-versions = "<4.0,>=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "aiolimiter-1.2.1-py3-none-any.whl", hash = "sha256:d3f249e9059a20badcb56b61601a83556133655c11d1eb3dd3e04ff069e5f3c7"}, {file = "aiolimiter-1.2.1.tar.gz", hash = "sha256:e02a37ea1a855d9e832252a105420ad4d15011505512a1a1d814647451b5cca9"}, @@ -149,6 +157,8 @@ version = "1.3.2" description = "aiosignal: a list of registered asynchronous callbacks" optional = true 
python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5"}, {file = "aiosignal-1.3.2.tar.gz", hash = "sha256:a8c255c66fafb1e499c9351d0bf32ff2d8a0321595ebac3b93713656d2436f54"}, @@ -163,6 +173,8 @@ version = "0.7.16" description = "A light, configurable Sphinx theme" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92"}, {file = "alabaster-0.7.16.tar.gz", hash = "sha256:75a8b99c28a5dad50dd7f8ccdd447a121ddb3892da9e53d1ca5cca3106d58d65"}, @@ -174,6 +186,8 @@ version = "0.7.0" description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -185,6 +199,8 @@ version = "4.8.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "anyio-4.8.0-py3-none-any.whl", hash = "sha256:b5011f270ab5eb0abf13385f851315585cc37ef330dd88e27ec3d34d651fd47a"}, {file = "anyio-4.8.0.tar.gz", hash = "sha256:1d9fe889df5212298c0c0723fa20479d1b94883a2df44bd3897aa91083316f7a"}, @@ -207,6 +223,8 @@ version = "0.1.4" description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" +groups = ["dev", "docs"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, @@ -218,6 +236,8 @@ version = "3.3.8" description = "An abstract syntax tree for Python with inference support." 
optional = false python-versions = ">=3.9.0" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "astroid-3.3.8-py3-none-any.whl", hash = "sha256:187ccc0c248bfbba564826c26f070494f7bc964fd286b6d9fff4420e55de828c"}, {file = "astroid-3.3.8.tar.gz", hash = "sha256:a88c7994f914a4ea8572fac479459f4955eeccc877be3f2d959a33273b0cf40b"}, @@ -232,6 +252,8 @@ version = "3.0.0" description = "Annotate AST trees with source code positions" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, @@ -247,6 +269,8 @@ version = "5.0.1" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_full_version < \"3.11.3\"" files = [ {file = "async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c"}, {file = "async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"}, @@ -258,10 +282,12 @@ version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] files = [ {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.extras] benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] @@ -277,6 +303,8 @@ version = "2.17.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2"}, {file = "babel-2.17.0.tar.gz", hash = "sha256:0c54cffb19f690cdcc52a3b50bcbf71e07a808d1c80d549f2459b9d2cf0afb9d"}, @@ -291,10 +319,12 @@ version = "4.13.3" description = "Screen-scraping library" optional = false python-versions = ">=3.7.0" +groups = ["main", "docs"] files = [ {file = "beautifulsoup4-4.13.3-py3-none-any.whl", hash = "sha256:99045d7d3f08f91f0d656bc9b7efbae189426cd913d830294a15eefa0ea4df16"}, {file = "beautifulsoup4-4.13.3.tar.gz", hash = "sha256:1bd32405dacc920b42b83ba01644747ed77456a65760e285fbc47633ceddaf8b"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.dependencies] soupsieve = ">1.2" @@ -313,6 +343,8 @@ version = "25.1.0" description = "The uncompromising code formatter." 
optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32"}, {file = "black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da"}, @@ -359,6 +391,8 @@ version = "6.2.0" description = "An easy safelist-based HTML-sanitizing tool." optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "bleach-6.2.0-py3-none-any.whl", hash = "sha256:117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e"}, {file = "bleach-6.2.0.tar.gz", hash = "sha256:123e894118b8a599fd80d3ec1a6d4cc7ce4e5882b1317a7e1ba69b56e95f991f"}, @@ -377,6 +411,8 @@ version = "1.36.0" description = "The AWS SDK for Python" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"bedrock\"" files = [ {file = "boto3-1.36.0-py3-none-any.whl", hash = "sha256:d0ca7a58ce25701a52232cc8df9d87854824f1f2964b929305722ebc7959d5a9"}, {file = "boto3-1.36.0.tar.gz", hash = "sha256:159898f51c2997a12541c0e02d6e5a8fe2993ddb307b9478fd9a339f98b57e00"}, @@ -396,6 +432,8 @@ version = "1.36.26" description = "Low-level, data-driven core of boto 3." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"bedrock\"" files = [ {file = "botocore-1.36.26-py3-none-any.whl", hash = "sha256:4e3f19913887a58502e71ef8d696fe7eaa54de7813ff73390cd5883f837dfa6e"}, {file = "botocore-1.36.26.tar.gz", hash = "sha256:4a63bcef7ecf6146fd3a61dc4f9b33b7473b49bdaf1770e9aaca6eee0c9eab62"}, @@ -418,6 +456,8 @@ version = "5.5.2" description = "Extensible memoizing collections and decorators" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a"}, {file = "cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4"}, @@ -429,6 +469,8 @@ version = "1.0.0" description = "RFC 7049 - Concise Binary Object Representation" optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "cbor-1.0.0.tar.gz", hash = "sha256:13225a262ddf5615cbd9fd55a76a0d53069d18b07d2e9f19c39e6acb8609bbb6"}, ] @@ -439,6 +481,8 @@ version = "5.6.5" description = "CBOR (de)serializer with extensive tag support" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "cbor2-5.6.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e16c4a87fc999b4926f5c8f6c696b0d251b4745bc40f6c5aee51d69b30b15ca2"}, {file = "cbor2-5.6.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:87026fc838370d69f23ed8572939bd71cea2b3f6c8f8bb8283f573374b4d7f33"}, @@ -497,10 +541,12 @@ version = "2025.1.31" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" +groups = ["main", "dev", "docs"] files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\" or extra == \"sentence-transformers\" or extra == \"vertexai\" or extra == \"voyageai\" or extra == \"ranx\") and (extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\" or extra == \"sentence-transformers\" or extra == \"vertexai\" or extra == \"voyageai\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "cffi" @@ -508,6 +554,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -577,6 +624,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] +markers = {dev = "(implementation_name == \"pypy\" or platform_python_implementation != \"PyPy\") and (python_version <= \"3.11\" or python_version >= \"3.12\")", docs = "(python_version <= \"3.11\" or python_version >= \"3.12\") and implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -587,6 +635,8 @@ version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, @@ -598,6 +648,7 @@ version = "3.4.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.7" +groups = ["main", "dev", "docs"] files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -692,6 +743,7 @@ files = [ {file = "charset_normalizer-3.4.1-py3-none-any.whl", hash = "sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85"}, {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"voyageai\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"voyageai\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "click" @@ -699,6 +751,8 @@ version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -713,6 +767,8 @@ version = "5.13.12" description = "" optional = true python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"cohere\"" files = [ {file = "cohere-5.13.12-py3-none-any.whl", hash = "sha256:2a043591a3e5280b47716a6b311e4c7f58e799364113a9cb81b50cd4f6c95f7e"}, {file = "cohere-5.13.12.tar.gz", hash = "sha256:97bb9ac107e580780b941acbabd3aa5e71960e6835398292c46aaa8a0a4cab88"}, @@ -735,10 +791,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev", "docs"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"openai\" or extra == \"ranx\") and platform_system == \"Windows\" and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"openai\" or python_version >= \"3.10\")", dev = "(platform_system == \"Windows\" or sys_platform == \"win32\") and (python_version <= \"3.11\" or python_version >= \"3.12\")", docs = "(platform_system == \"Windows\" or sys_platform == \"win32\") and (python_version <= \"3.11\" or python_version >= \"3.12\")"} [[package]] name = "coloredlogs" @@ -746,6 +804,8 @@ version = "15.0.1" description = "Colored terminal output for Python's logging module" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, @@ -763,6 +823,8 @@ version = "0.2.2" description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, @@ -780,6 +842,8 @@ version = "1.3.1" description = "Python library for calculating contours of 2D quadrilateral grids" optional = true python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "contourpy-1.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a045f341a77b77e1c5de31e74e966537bba9f3c4099b35bf4c2e3939dd54cdab"}, {file = "contourpy-1.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:500360b77259914f7805af7462e41f9cb7ca92ad38e9f94d6c8641b089338124"}, @@ -853,6 +917,8 @@ version = "7.6.12" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "coverage-7.6.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:704c8c8c6ce6569286ae9622e534b4f5b9759b6f2cd643f1c1a61f666d534fe8"}, {file = "coverage-7.6.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ad7525bf0241e5502168ae9c643a2f6c219fa0a283001cee4cf23a9b7da75879"}, @@ -928,6 +994,8 @@ version = "2.9.1" description = "Thin Python bindings to de/compression algorithms in Rust" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = 
"cramjam-2.9.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:8e82464d1e00fbbb12958999b8471ba5e9f3d9711954505a0a7b378762332e6f"}, {file = "cramjam-2.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d2df8a6511cc08ef1fccd2e0c65e2ebc9f57574ec8376052a76851af5398810"}, @@ -1030,6 +1098,8 @@ version = "44.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "cryptography-44.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf688f615c29bfe9dfc44312ca470989279f0e94bb9f631f85e3459af8efc009"}, {file = "cryptography-44.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd7c7e2d71d908dc0f8d2027e1604102140d84b155e658c20e8ad1304317691f"}, @@ -1083,6 +1153,8 @@ version = "0.12.1" description = "Composable style cycles" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, @@ -1098,6 +1170,8 @@ version = "1.8.12" description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a"}, {file = "debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45"}, @@ -1133,6 +1207,8 @@ version = "5.2.1" description = "Decorators for Humans" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a"}, {file = "decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360"}, @@ -1144,6 +1220,8 @@ version = "0.7.1" description = "XML bomb protection for Python stdlib modules" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, @@ -1155,6 +1233,8 @@ version = "0.3.9" description = "serialize all of Python" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "dill-0.3.9-py3-none-any.whl", hash = "sha256:468dff3b89520b474c0397703366b7b95eebe6303f108adf9b19da1f702be87a"}, {file = "dill-0.3.9.tar.gz", hash = "sha256:81aa267dddf68cbfe8029c42ca9ec6a4ab3b22371d1c450abc54422577b4512c"}, @@ -1170,6 +1250,8 @@ version = "0.3.9" 
description = "Distribution utilities" optional = false python-versions = "*" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "distlib-0.3.9-py2.py3-none-any.whl", hash = "sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87"}, {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, @@ -1181,6 +1263,8 @@ version = "1.9.0" description = "Distro - an OS platform information API" optional = true python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"openai\"" files = [ {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, @@ -1192,6 +1276,8 @@ version = "7.1.0" description = "A Python library for the Docker Engine API." optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, @@ -1214,6 +1300,8 @@ version = "0.16" description = "Parse Python docstrings in reST, Google and Numpydoc format" optional = true python-versions = ">=3.6,<4.0" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "docstring_parser-0.16-py3-none-any.whl", hash = "sha256:bf0a1387354d3691d102edef7ec124f219ef639982d096e26e3b60aeffa90637"}, {file = "docstring_parser-0.16.tar.gz", hash = "sha256:538beabd0af1e2db0146b6bd3caa526c35a34d61af9fd2887f3a8a27a739aa6e"}, @@ -1225,6 +1313,8 @@ version = "0.21.2" description = "Docutils -- Python Documentation Utilities" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2"}, {file = "docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f"}, @@ -1236,6 +1326,8 @@ version = "0.2.2" description = "Like `typing._eval_type`, but lets older Python versions use newer typing features." 
optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"mistralai\"" files = [ {file = "eval_type_backport-0.2.2-py3-none-any.whl", hash = "sha256:cb6ad7c393517f476f96d456d0412ea80f0a8cf96f6892834cd9340149111b0a"}, {file = "eval_type_backport-0.2.2.tar.gz", hash = "sha256:f0576b4cf01ebb5bd358d02314d31846af5e07678387486e2c798af0e7d849c1"}, @@ -1250,10 +1342,12 @@ version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "docs"] files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] +markers = {main = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and python_version < \"3.11\"", dev = "python_version < \"3.11\"", docs = "python_version < \"3.11\""} [package.extras] test = ["pytest (>=6)"] @@ -1264,6 +1358,8 @@ version = "2.1.1" description = "execnet: rapid multi-Python deployment" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"}, {file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"}, @@ -1278,6 +1374,8 @@ version = "2.2.0" description = "Get the currently executing AST node of a frame, and other information" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, @@ -1292,6 +1390,8 @@ version = "1.10.0" description = "Fast read/write of AVRO files" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"cohere\"" files = [ {file = "fastavro-1.10.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1a9fe0672d2caf0fe54e3be659b13de3cad25a267f2073d6f4b9f8862acc31eb"}, {file = "fastavro-1.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86dd0410770e0c99363788f0584523709d85e57bb457372ec5c285a482c17fe6"}, @@ -1338,6 +1438,8 @@ version = "2.21.1" description = "Fastest Python implementation of JSON schema" optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "fastjsonschema-2.21.1-py3-none-any.whl", hash = "sha256:c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667"}, {file = "fastjsonschema-2.21.1.tar.gz", hash = "sha256:794d4f0a58f848961ba16af7b9c85a3e88cd360df008c59aac6fc5ae9323b5d4"}, @@ -1352,6 +1454,8 @@ version = "2024.11.0" description = "Python support for Parquet file format" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "fastparquet-2024.11.0-cp310-cp310-macosx_10_9_universal2.whl", 
hash = "sha256:60ccf587410f0979105e17036df61bb60e1c2b81880dc91895cdb4ee65b71e7f"}, {file = "fastparquet-2024.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a5ad5fc14b0567e700bea3cd528a0bd45a6f9371370b49de8889fb3d10a6574a"}, @@ -1412,10 +1516,12 @@ version = "3.17.0" description = "A platform independent file lock." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, ] +markers = {main = "(extra == \"sentence-transformers\" or extra == \"cohere\") and (python_version <= \"3.11\" or python_version >= \"3.12\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] @@ -1428,6 +1534,8 @@ version = "4.56.0" description = "Tools to manipulate font files" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "fonttools-4.56.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:331954d002dbf5e704c7f3756028e21db07097c19722569983ba4d74df014000"}, {file = "fonttools-4.56.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d1613abd5af2f93c05867b3a3759a56e8bf97eb79b1da76b2bc10892f96ff16"}, @@ -1501,6 +1609,8 @@ version = "1.5.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5b6a66c18b5b9dd261ca98dffcb826a525334b2f29e7caa54e182255c5f6a65a"}, {file = "frozenlist-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1b3eb7b05ea246510b43a7e53ed1653e55c2121019a97e60cad7efb881a97bb"}, @@ -1602,6 +1712,8 @@ version = "2025.2.0" description = "File-system specification" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or python_version >= \"3.10\")" files = [ {file = "fsspec-2025.2.0-py3-none-any.whl", hash = "sha256:9de2ad9ce1f85e1931858535bc882543171d197001a0a5eb2ddc04f1781ab95b"}, {file = "fsspec-2025.2.0.tar.gz", hash = "sha256:1c24b16eaa0a1798afa0337aa0db9b256718ab2a89c425371f5628d22c3b6afd"}, @@ -1641,6 +1753,8 @@ version = "2.24.1" description = "Google API client core library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"}, {file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"}, @@ -1676,6 +1790,8 @@ version = "2.38.0" description = "Google Authentication Library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = 
[ {file = "google_auth-2.38.0-py2.py3-none-any.whl", hash = "sha256:e7dae6694313f434a2727bf2906f27ad259bae090d7aa896590d86feec3d9d4a"}, {file = "google_auth-2.38.0.tar.gz", hash = "sha256:8285113607d3b80a3f1543b75962447ba8a09fe85783432a784fdeef6ac094c4"}, @@ -1700,6 +1816,8 @@ version = "1.82.0" description = "Vertex AI API client library" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_cloud_aiplatform-1.82.0-py2.py3-none-any.whl", hash = "sha256:13368a961b2bfa8f46ccd10371bb19bd5f946d8f29c411726061ed1a140ce890"}, {file = "google_cloud_aiplatform-1.82.0.tar.gz", hash = "sha256:b7ea7379249cc1821aa46300a16e4b15aa64aa22665e2536b2bcb7e473d7438e"}, @@ -1751,6 +1869,8 @@ version = "3.30.0" description = "Google BigQuery API client library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_cloud_bigquery-3.30.0-py2.py3-none-any.whl", hash = "sha256:f4d28d846a727f20569c9b2d2f4fa703242daadcb2ec4240905aa485ba461877"}, {file = "google_cloud_bigquery-3.30.0.tar.gz", hash = "sha256:7e27fbafc8ed33cc200fe05af12ecd74d279fe3da6692585a3cef7aee90575b6"}, @@ -1782,6 +1902,8 @@ version = "2.4.2" description = "Google Cloud API client core library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_cloud_core-2.4.2-py2.py3-none-any.whl", hash = "sha256:7459c3e83de7cb8b9ecfec9babc910efb4314030c56dd798eaad12c426f7d180"}, {file = "google_cloud_core-2.4.2.tar.gz", hash = "sha256:a4fcb0e2fcfd4bfe963837fad6d10943754fd79c1a50097d68540b6eb3d67f35"}, @@ -1800,6 +1922,8 @@ version = "1.14.1" description = "Google Cloud Resource Manager API client library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_cloud_resource_manager-1.14.1-py2.py3-none-any.whl", hash = "sha256:68340599f85ebf07a6e18487e460ea07cc15e132068f6b188786d01c2cf25518"}, {file = "google_cloud_resource_manager-1.14.1.tar.gz", hash = "sha256:41e9e546aaa03d5160cdfa2341dbe81ef7596706c300a89b94c429f1f3411f87"}, @@ -1821,6 +1945,8 @@ version = "2.19.0" description = "Google Cloud Storage API client library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_cloud_storage-2.19.0-py2.py3-none-any.whl", hash = "sha256:aeb971b5c29cf8ab98445082cbfe7b161a1f48ed275822f59ed3f1524ea54fba"}, {file = "google_cloud_storage-2.19.0.tar.gz", hash = "sha256:cd05e9e7191ba6cb68934d8eb76054d9be4562aa89dbc4236feee4d7d51342b2"}, @@ -1844,6 +1970,8 @@ version = "1.6.0" description = "A python wrapper of the C library 'Google CRC32C'" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa"}, {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9"}, @@ -1883,6 +2011,8 @@ version 
= "2.7.2" description = "Utilities for Google Media Downloads and Resumable Uploads" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, @@ -1901,6 +2031,8 @@ version = "1.68.0" description = "Common protobufs used in Google APIs" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "googleapis_common_protos-1.68.0-py2.py3-none-any.whl", hash = "sha256:aaf179b2f81df26dfadac95def3b16a95064c76a5f45f07e4c68a21bb371c4ac"}, {file = "googleapis_common_protos-1.68.0.tar.gz", hash = "sha256:95d38161f4f9af0d9423eed8fb7b64ffd2568c3464eb542ff02c5bfa1953ab3c"}, @@ -1919,6 +2051,8 @@ version = "3.1.1" description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.7" +groups = ["docs"] +markers = "(platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, @@ -2005,6 +2139,8 @@ version = "0.14.0" description = "IAM API client library" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "grpc_google_iam_v1-0.14.0-py2.py3-none-any.whl", hash = "sha256:fb4a084b30099ba3ab07d61d620a0d4429570b13ff53bd37bac75235f98b7da4"}, {file = "grpc_google_iam_v1-0.14.0.tar.gz", hash = "sha256:c66e07aa642e39bb37950f9e7f491f70dad150ac9801263b42b2814307c2df99"}, @@ -2021,6 +2157,8 @@ version = "1.70.0" description = "HTTP/2-based RPC framework" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "grpcio-1.70.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:95469d1977429f45fe7df441f586521361e235982a0b39e33841549143ae2851"}, {file = "grpcio-1.70.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:ed9718f17fbdb472e33b869c77a16d0b55e166b100ec57b016dc7de9c8d236bf"}, @@ -2088,6 +2226,8 @@ version = "1.70.0" description = "Status proto mapping for gRPC" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"vertexai\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "grpcio_status-1.70.0-py3-none-any.whl", hash = "sha256:fc5a2ae2b9b1c1969cc49f3262676e6854aa2398ec69cb5bd6c47cd501904a85"}, {file = "grpcio_status-1.70.0.tar.gz", hash = "sha256:0e7b42816512433b18b9d764285ff029bde059e9d41f8fe10a60631bd8348101"}, @@ -2104,6 +2244,8 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of 
HTTP/1.1" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -2115,6 +2257,8 @@ version = "1.0.7" description = "A minimal low-level HTTP client." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd"}, {file = "httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c"}, @@ -2136,6 +2280,8 @@ version = "0.28.1" description = "The next generation HTTP client." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad"}, {file = "httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc"}, @@ -2160,6 +2306,8 @@ version = "0.4.0" description = "Consume Server-Sent Event (SSE) messages with HTTPX." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"cohere\"" files = [ {file = "httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721"}, {file = "httpx_sse-0.4.0-py3-none-any.whl", hash = "sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f"}, @@ -2171,6 +2319,8 @@ version = "0.29.1" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = true python-versions = ">=3.8.0" +groups = ["main"] +markers = "(extra == \"sentence-transformers\" or extra == \"cohere\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "huggingface_hub-0.29.1-py3-none-any.whl", hash = "sha256:352f69caf16566c7b6de84b54a822f6238e17ddd8ae3da4f8f2272aea5b198d5"}, {file = "huggingface_hub-0.29.1.tar.gz", hash = "sha256:9524eae42077b8ff4fc459ceb7a514eca1c1232b775276b009709fe2a084f250"}, @@ -2205,6 +2355,8 @@ version = "10.0" description = "Human friendly output for text interfaces using Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, @@ -2219,6 +2371,8 @@ version = "2.6.8" description = "File identification library for Python" optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "identify-2.6.8-py2.py3-none-any.whl", hash = 
"sha256:83657f0f766a3c8d0eaea16d4ef42494b39b34629a4b3192a9d020d349b3e255"}, {file = "identify-2.6.8.tar.gz", hash = "sha256:61491417ea2c0c5c670484fd8abbb34de34cdae1e5f39a73ee65e48e4bb663fc"}, @@ -2233,10 +2387,12 @@ version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" +groups = ["main", "dev", "docs"] files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\" or extra == \"sentence-transformers\" or extra == \"vertexai\" or extra == \"voyageai\" or extra == \"ranx\") and (extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\" or extra == \"sentence-transformers\" or extra == \"vertexai\" or extra == \"voyageai\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] @@ -2247,6 +2403,8 @@ version = "3.3.0" description = "Iterative JSON parser with standard Python iterator interfaces" optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "ijson-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7f7a5250599c366369fbf3bc4e176f5daa28eb6bc7d6130d02462ed335361675"}, {file = "ijson-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f87a7e52f79059f9c58f6886c262061065eb6f7554a587be7ed3aa63e6b71b34"}, @@ -2350,6 +2508,8 @@ version = "1.4.1" description = "Getting image size from png/jpeg/jpeg2000/gif file" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, @@ -2361,10 +2521,12 @@ version = "8.6.1" description = "Read metadata from Python packages" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] files = [ {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, ] +markers = {dev = "python_version < \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.dependencies] zipp = ">=3.20" @@ -2384,6 +2546,8 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -2395,6 +2559,8 @@ version = "2.5.3" 
description = "inscriptis - HTML to text converter." optional = true python-versions = "<4.0,>=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "inscriptis-2.5.3-py3-none-any.whl", hash = "sha256:25962cf5a60b1a8f33e7bfbbea08a29af82299702339b9b90c538653a5c7aa38"}, {file = "inscriptis-2.5.3.tar.gz", hash = "sha256:256043caa13e4995c71fafdeadec4ac42b57f3914cb41023ecbee8bc27ca1cc0"}, @@ -2413,6 +2579,8 @@ version = "6.29.5" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, @@ -2446,6 +2614,8 @@ version = "8.18.1" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, @@ -2483,6 +2653,8 @@ version = "0.5.9" description = "provides a common interface to many IR ad-hoc ranking benchmarks, training datasets, etc." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "ir_datasets-0.5.9-py3-none-any.whl", hash = "sha256:07c9bed07f31031f1da1bc02afc7a1077b1179a3af402d061f83bf6fb833b90a"}, {file = "ir_datasets-0.5.9.tar.gz", hash = "sha256:35c90980fbd0f4ea8fe22a1ab16d2bb6be3dc373cbd6dfab1d905f176a70e5ac"}, @@ -2510,6 +2682,8 @@ version = "5.13.2" description = "A Python utility / library to sort Python imports." optional = false python-versions = ">=3.8.0" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, @@ -2524,6 +2698,8 @@ version = "0.19.2" description = "An autocompletion tool for Python that can be used for text editors." optional = false python-versions = ">=3.6" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, @@ -2543,6 +2719,8 @@ version = "3.1.5" description = "A very fast and expressive template engine." 
optional = false python-versions = ">=3.7" +groups = ["main", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"}, {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"}, @@ -2560,6 +2738,8 @@ version = "0.8.2" description = "Fast iterable JSON parser." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"openai\"" files = [ {file = "jiter-0.8.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ca8577f6a413abe29b079bc30f907894d7eb07a865c4df69475e868d73e71c7b"}, {file = "jiter-0.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b25bd626bde7fb51534190c7e3cb97cee89ee76b76d7585580e22f34f5e3f393"}, @@ -2645,6 +2825,8 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"bedrock\"" files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -2656,17 +2838,38 @@ version = "1.4.2" description = "Lightweight pipelining with Python functions" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"}, {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"}, ] +[[package]] +name = "jsonpath-ng" +version = "1.7.0" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "jsonpath-ng-1.7.0.tar.gz", hash = "sha256:f6f5f7fd4e5ff79c785f1573b394043b39849fb2bb47bcead935d12b00beab3c"}, + {file = "jsonpath_ng-1.7.0-py2-none-any.whl", hash = "sha256:898c93fc173f0c336784a3fa63d7434297544b7198124a68f9a3ef9597b0ae6e"}, + {file = "jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpath-python" version = "1.0.6" description = "A more powerful JSONPath implementation in modern python" optional = true python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"mistralai\"" files = [ {file = "jsonpath-python-1.0.6.tar.gz", hash = "sha256:dd5be4a72d8a2995c3f583cf82bf3cd1a9544cfdabf2d22595b67aff07349666"}, {file = "jsonpath_python-1.0.6-py3-none-any.whl", hash = "sha256:1e3b78df579f5efc23565293612decee04214609208a2335884b3ee3f786b575"}, @@ -2678,6 +2881,8 @@ version = "4.23.0" description = "An implementation of JSON Schema validation for Python" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, @@ -2699,6 +2904,8 @@ version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, @@ -2713,6 +2920,8 @@ version = "1.0.1" description = "A defined interface for working with a cache of jupyter notebooks." optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_cache-1.0.1-py3-none-any.whl", hash = "sha256:9c3cafd825ba7da8b5830485343091143dff903e4d8c69db9349b728b140abf6"}, {file = "jupyter_cache-1.0.1.tar.gz", hash = "sha256:16e808eb19e3fb67a223db906e131ea6e01f03aa27f49a7214ce6a5fec186fb9"}, @@ -2740,6 +2949,8 @@ version = "8.6.3" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, @@ -2763,6 +2974,8 @@ version = "5.7.2" description = "Jupyter core package. A base package on which Jupyter projects rely." 
optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409"}, {file = "jupyter_core-5.7.2.tar.gz", hash = "sha256:aa5f8d32bbf6b431ac830496da7392035d6f61b4f54872f15c4bd2a9c3f536d9"}, @@ -2783,6 +2996,8 @@ version = "0.3.0" description = "Pygments theme using JupyterLab CSS variables" optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"}, {file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"}, @@ -2794,6 +3009,8 @@ version = "1.4.8" description = "A fast implementation of the Cassowary constraint solver" optional = true python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "kiwisolver-1.4.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:88c6f252f6816a73b1f8c904f7bbe02fd67c09a69f7cb8a0eecdbf5ce78e63db"}, {file = "kiwisolver-1.4.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c72941acb7b67138f35b879bbe85be0f6c6a70cab78fe3ef6db9c024d9223e5b"}, @@ -2883,6 +3100,8 @@ version = "0.44.0" description = "lightweight wrapper around basic LLVM functionality" optional = true python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "llvmlite-0.44.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:9fbadbfba8422123bab5535b293da1cf72f9f478a65645ecd73e781f962ca614"}, {file = "llvmlite-0.44.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cccf8eb28f24840f2689fb1a45f9c0f7e582dd24e088dcf96e424834af11f791"}, @@ -2913,6 +3132,8 @@ version = "5.3.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." optional = true python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "lxml-5.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a4058f16cee694577f7e4dd410263cd0ef75644b43802a689c2b3c2a7e69453b"}, {file = "lxml-5.3.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:364de8f57d6eda0c16dcfb999af902da31396949efa0e583e12675d09709881b"}, @@ -3067,6 +3288,8 @@ version = "4.4.3" description = "LZ4 Bindings for Python" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "lz4-4.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1ebf23ffd36b32b980f720a81990fcfdeadacafe7498fbeff7a8e058259d4e58"}, {file = "lz4-4.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8fe3caea61427057a9e3697c69b2403510fdccfca4483520d02b98ffae74531e"}, @@ -3112,10 +3335,12 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" 
optional = false python-versions = ">=3.8" +groups = ["main", "docs"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.dependencies] mdurl = ">=0.1,<1.0" @@ -3136,6 +3361,8 @@ version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = false python-versions = ">=3.9" +groups = ["main", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, @@ -3206,6 +3433,8 @@ version = "3.10.1" description = "Python plotting package" optional = true python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "matplotlib-3.10.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ff2ae14910be903f4a24afdbb6d7d3a6c44da210fc7d42790b87aeac92238a16"}, {file = "matplotlib-3.10.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0721a3fd3d5756ed593220a8b86808a36c5031fce489adb5b31ee6dbb47dd5b2"}, @@ -3263,6 +3492,8 @@ version = "0.1.7" description = "Inline Matplotlib backend for Jupyter" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, @@ -3277,6 +3508,8 @@ version = "0.7.0" description = "McCabe checker, plugin for flake8" optional = false python-versions = ">=3.6" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, @@ -3288,6 +3521,8 @@ version = "0.4.2" description = "Collection of plugins for markdown-it-py" optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"}, {file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"}, @@ -3307,10 +3542,12 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" +groups = ["main", "docs"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = 
"sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "mistralai" @@ -3318,6 +3555,8 @@ version = "1.5.0" description = "Python Client SDK for the Mistral AI API." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"mistralai\"" files = [ {file = "mistralai-1.5.0-py3-none-any.whl", hash = "sha256:9372537719f87bd6f9feef4747d0bf1f4fbe971f8c02945ca4b4bf3c94571c97"}, {file = "mistralai-1.5.0.tar.gz", hash = "sha256:fd94bc93bc25aad9c6dd8005b1a0bc4ba1250c6b3fbf855a49936989cc6e5c0d"}, @@ -3340,6 +3579,8 @@ version = "3.1.2" description = "A sane and fast Markdown parser with useful plugins and renderers" optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mistune-3.1.2-py3-none-any.whl", hash = "sha256:4b47731332315cdca99e0ded46fc0004001c1299ff773dfb48fbe1fd226de319"}, {file = "mistune-3.1.2.tar.gz", hash = "sha256:733bf018ba007e8b5f2d3a9eb624034f6ee26c4ea769a98ec533ee111d504dff"}, @@ -3354,6 +3595,8 @@ version = "0.4.1" description = "" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ml_dtypes-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1fe8b5b5e70cd67211db94b05cfd58dace592f24489b038dc6f9fe347d2e07d5"}, {file = "ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c09a6d11d8475c2a9fd2bc0695628aec105f97cab3b3a3fb7c9660348ff7d24"}, @@ -3377,8 +3620,8 @@ files = [ [package.dependencies] numpy = [ {version = ">1.20", markers = "python_version < \"3.10\""}, - {version = ">=1.23.3", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.2", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.3", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] @@ -3391,6 +3634,8 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = true python-versions = "*" +groups = ["main"] +markers = "extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -3408,6 +3653,8 @@ version = "6.1.0" description = "multidict implementation" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3380252550e372e8511d49481bd836264c009adb826b23fefcc5dd3c69692f60"}, {file = "multidict-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:99f826cbf970077383d7de805c0681799491cb939c25450b9b5b3ced03ca99f1"}, @@ -3512,6 +3759,8 @@ version = "1.9.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] 
+markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mypy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8a67616990062232ee4c3952f41c779afac41405806042a8126fe96e098419f"}, {file = "mypy-1.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d357423fa57a489e8c47b7c85dfb96698caba13d66e086b412298a1a0ea3b0ed"}, @@ -3559,10 +3808,12 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.5" +groups = ["main", "dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"mistralai\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "myst-nb" @@ -3570,6 +3821,8 @@ version = "1.2.0" description = "A Jupyter Notebook Sphinx reader built on top of the MyST markdown parser." optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "myst_nb-1.2.0-py3-none-any.whl", hash = "sha256:0e09909877848c0cf45e1aecee97481512efa29a0c4caa37870a03bba11c56c1"}, {file = "myst_nb-1.2.0.tar.gz", hash = "sha256:af459ec753b341952182b45b0a80b4776cebf80c9ee6aaca2a3f4027b440c9de"}, @@ -3598,6 +3851,8 @@ version = "3.0.1" description = "An extended [CommonMark](https://spec.commonmark.org/) compliant parser," optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "myst_parser-3.0.1-py3-none-any.whl", hash = "sha256:6457aaa33a5d474aca678b8ead9b3dc298e89c68e67012e73146ea6fd54babf1"}, {file = "myst_parser-3.0.1.tar.gz", hash = "sha256:88f0cb406cb363b077d176b51c476f62d60604d68a8dcdf4832e080441301a87"}, @@ -3624,6 +3879,8 @@ version = "0.10.2" description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." optional = false python-versions = ">=3.9.0" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nbclient-0.10.2-py3-none-any.whl", hash = "sha256:4ffee11e788b4a27fabeb7955547e4318a5298f34342a4bfd01f2e1faaeadc3d"}, {file = "nbclient-0.10.2.tar.gz", hash = "sha256:90b7fc6b810630db87a6d0c2250b1f0ab4cf4d3c27a299b0cde78a4ed3fd9193"}, @@ -3646,6 +3903,8 @@ version = "7.16.6" description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." 
optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nbconvert-7.16.6-py3-none-any.whl", hash = "sha256:1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b"}, {file = "nbconvert-7.16.6.tar.gz", hash = "sha256:576a7e37c6480da7b8465eefa66c17844243816ce1ccc372633c6b71c3c0f582"}, @@ -3683,6 +3942,8 @@ version = "5.10.4" description = "The Jupyter Notebook format" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"}, {file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"}, @@ -3704,6 +3965,8 @@ version = "0.9.6" description = "Jupyter Notebook Tools for Sphinx" optional = false python-versions = ">=3.6" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nbsphinx-0.9.6-py3-none-any.whl", hash = "sha256:336b0b557945a7678ec7449b16449f854bc852a435bb53b8a72e6b5dc740d992"}, {file = "nbsphinx-0.9.6.tar.gz", hash = "sha256:c2b28a2d702f1159a95b843831798e86e60a17fc647b9bff9ba1585355de54e3"}, @@ -3723,6 +3986,8 @@ version = "0.11.0" description = "A py.test plugin to validate Jupyter notebooks" optional = false python-versions = ">=3.7, <4" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nbval-0.11.0-py2.py3-none-any.whl", hash = "sha256:307aecc866c9a1e8a13bb5bbb008a702bacfda2394dff6fe504a3108a58042a0"}, {file = "nbval-0.11.0.tar.gz", hash = "sha256:77c95797607b0a968babd2597ee3494102d25c3ad37435debbdac0e46e379094"}, @@ -3741,6 +4006,8 @@ version = "1.6.0" description = "Patch asyncio to allow nested event loops" optional = false python-versions = ">=3.5" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, @@ -3752,6 +4019,8 @@ version = "3.2.1" description = "Python package for creating and manipulating graphs and networks" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, @@ -3770,6 +4039,8 @@ version = "1.9.1" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, @@ -3781,6 +4052,8 @@ version = "0.61.0" description = "compiling Python code using LLVM" optional = true python-versions = ">=3.10" +groups = 
["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "numba-0.61.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:9cab9783a700fa428b1a54d65295122bc03b3de1d01fb819a6b9dbbddfdb8c43"}, {file = "numba-0.61.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:46c5ae094fb3706f5adf9021bfb7fc11e44818d61afee695cdee4eadfed45e98"}, @@ -3815,6 +4088,8 @@ version = "1.26.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\"" files = [ {file = "numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0"}, {file = "numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a"}, @@ -3860,6 +4135,8 @@ version = "2.1.3" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.10" +groups = ["main"] +markers = "python_version >= \"3.12\"" files = [ {file = "numpy-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff"}, {file = "numpy-2.1.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5"}, @@ -3924,6 +4201,8 @@ version = "12.4.5.8" description = "CUBLAS native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, @@ -3936,6 +4215,8 @@ version = "12.4.127" description = "CUDA profiling tools runtime libs." 
optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, @@ -3948,6 +4229,8 @@ version = "12.4.127" description = "NVRTC native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, @@ -3960,6 +4243,8 @@ version = "12.4.127" description = "CUDA Runtime native Libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, @@ -3972,6 +4257,8 @@ version = "9.1.0.70" description = "cuDNN runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f"}, {file = "nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a"}, @@ -3986,6 +4273,8 @@ version = "11.2.1.3" description = "CUFFT native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, @@ -4001,6 +4290,8 @@ version = "10.3.5.147" description = "CURAND native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = 
"nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, @@ -4013,6 +4304,8 @@ version = "11.6.1.9" description = "CUDA solver native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, @@ -4030,6 +4323,8 @@ version = "12.3.1.170" description = "CUSPARSE native runtime libraries" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, @@ -4045,6 +4340,8 @@ version = "0.6.2" description = "NVIDIA cuSPARSELt" optional = true python-versions = "*" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8"}, {file = "nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9"}, @@ -4057,6 +4354,8 @@ version = "2.21.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] @@ -4067,7 +4366,10 @@ version = "12.4.127" description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = 
"sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, ] @@ -4078,6 +4380,8 @@ version = "12.4.127" description = "NVIDIA Tools Extension" optional = true python-versions = ">=3" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, @@ -4090,6 +4394,8 @@ version = "1.65.1" description = "The official Python library for the openai API" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"openai\"" files = [ {file = "openai-1.65.1-py3-none-any.whl", hash = "sha256:396652a6452dd42791b3ad8a3aab09b1feb7c1c4550a672586fb300760a8e204"}, {file = "openai-1.65.1.tar.gz", hash = "sha256:9d9370a20d2b8c3ce319fd2194c2eef5eab59effbcc5b04ff480977edc530fba"}, @@ -4115,6 +4421,8 @@ version = "3.10.15" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "orjson-3.10.15-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:552c883d03ad185f720d0c09583ebde257e41b9521b74ff40e08b7dec4559c04"}, {file = "orjson-3.10.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:616e3e8d438d02e4854f70bfdc03a6bcdb697358dbaa6bcd19cbe24d24ece1f8"}, @@ -4203,10 +4511,12 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "pandas" @@ -4214,6 +4524,8 @@ version = "2.2.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, @@ -4300,6 +4612,8 @@ version = "1.5.1" description = "Utilities for writing pandoc filters in python" optional = false python-versions 
= ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"}, {file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"}, @@ -4311,6 +4625,8 @@ version = "0.8.4" description = "A Python Parser" optional = false python-versions = ">=3.6" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, @@ -4326,6 +4642,8 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -4337,6 +4655,8 @@ version = "4.9.0" description = "Pexpect allows easy control of interactive console applications." optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and sys_platform != \"win32\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, @@ -4351,6 +4671,8 @@ version = "11.1.0" description = "Python Imaging Library (Fork)" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or python_version >= \"3.10\")" files = [ {file = "pillow-11.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:e1abe69aca89514737465752b4bcaf8016de61b3be1397a8fc260ba33321b3a8"}, {file = "pillow-11.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c640e5a06869c75994624551f45e5506e4256562ead981cce820d5ab39ae2192"}, @@ -4439,6 +4761,8 @@ version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, @@ -4455,6 +4779,8 @@ version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -4464,12 +4790,27 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "pre-commit" version = "4.1.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pre_commit-4.1.0-py2.py3-none-any.whl", hash = "sha256:d29e7cb346295bcc1cc75fc3e92e343495e3ea0196c9ec6ba53f49f10ab6ae7b"}, {file = "pre_commit-4.1.0.tar.gz", hash = "sha256:ae3f018575a588e30dfddfab9a05448bfbd6b73d78709617b5a2b853549716d4"}, @@ -4488,6 +4829,8 @@ version = "3.0.50" description = "Library for building powerful interactive command lines in Python" optional = false python-versions = ">=3.8.0" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198"}, {file = "prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab"}, @@ -4502,6 +4845,8 @@ version = "0.3.0" description = "Accelerated property cache" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "propcache-0.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa44f64c37cc30c9f05932c740a8b40ce359f51882c70883cc95feac842da4d"}, {file = "propcache-0.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2383a17385d9800b6eb5855c2f05ee550f803878f344f58b6e194de08b96352c"}, @@ -4609,6 +4954,8 @@ version = "1.26.0" description = "Beautiful, Pythonic protocol buffers" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"}, {file = "proto_plus-1.26.0.tar.gz", hash = 
"sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"}, @@ -4626,6 +4973,8 @@ version = "5.29.3" description = "" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "protobuf-5.29.3-cp310-abi3-win32.whl", hash = "sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888"}, {file = "protobuf-5.29.3-cp310-abi3-win_amd64.whl", hash = "sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a"}, @@ -4646,6 +4995,8 @@ version = "7.0.0" description = "Cross-platform lib for process and system monitoring in Python. NOTE: the syntax of this script MUST be kept compatible with Python 2.7." optional = false python-versions = ">=3.6" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "psutil-7.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:101d71dc322e3cffd7cea0650b09b3d08b8e7c4109dd6809fe452dfd00e58b25"}, {file = "psutil-7.0.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:39db632f6bb862eeccf56660871433e111b6ea58f2caea825571951d4b6aa3da"}, @@ -4669,6 +5020,8 @@ version = "0.7.0" description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and sys_platform != \"win32\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -4680,6 +5033,8 @@ version = "0.2.3" description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, @@ -4694,6 +5049,8 @@ version = "0.6.1" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629"}, {file = "pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034"}, @@ -4705,6 +5062,8 @@ version = "0.4.1" description = "A collection of ASN.1-based protocols modules" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "pyasn1_modules-0.4.1-py3-none-any.whl", hash = "sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd"}, {file = "pyasn1_modules-0.4.1.tar.gz", hash = "sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c"}, @@ -4719,10 +5078,12 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = 
"pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +markers = {dev = "(implementation_name == \"pypy\" or platform_python_implementation != \"PyPy\") and (python_version <= \"3.11\" or python_version >= \"3.12\")", docs = "(python_version <= \"3.11\" or python_version >= \"3.12\") and implementation_name == \"pypy\""} [[package]] name = "pydantic" @@ -4730,6 +5091,8 @@ version = "2.10.6" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"}, {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"}, @@ -4750,6 +5113,8 @@ version = "2.27.2" description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -4862,6 +5227,8 @@ version = "0.15.4" description = "Bootstrap-based Sphinx theme from the PyData community" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydata_sphinx_theme-0.15.4-py3-none-any.whl", hash = "sha256:2136ad0e9500d0949f96167e63f3e298620040aea8f9c74621959eda5d4cf8e6"}, {file = "pydata_sphinx_theme-0.15.4.tar.gz", hash = "sha256:7762ec0ac59df3acecf49fd2f889e1b4565dbce8b88b2e29ee06fdd90645a06d"}, @@ -4890,10 +5257,12 @@ version = "2.19.1" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.extras] windows-terminal = ["colorama (>=0.4.6)"] @@ -4904,6 +5273,8 @@ version = "3.3.4" description = "python code static checker" optional = false python-versions = ">=3.9.0" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pylint-3.3.4-py3-none-any.whl", hash = "sha256:289e6a1eb27b453b08436478391a48cd53bb0efb824873f949e709350f3de018"}, {file = "pylint-3.3.4.tar.gz", hash = "sha256:74ae7a38b177e69a9b525d0794bd8183820bfa7eb68cc1bee6e8ed22a42be4ce"}, @@ -4934,6 +5305,8 @@ version = "3.2.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "pyparsing-3.2.1-py3-none-any.whl", hash = "sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1"}, {file = "pyparsing-3.2.1.tar.gz", hash = "sha256:61980854fd66de3a90028d679a954d5f2623e83144b5afe5ee86f43d762e5f0a"}, @@ -4948,6 +5321,8 @@ version = "3.5.4" description = "A python implementation of GNU readline." 
optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "sys_platform == \"win32\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, @@ -4962,6 +5337,8 @@ version = "8.3.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, @@ -4984,6 +5361,8 @@ version = "0.23.8" description = "Pytest support for asyncio" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytest_asyncio-0.23.8-py3-none-any.whl", hash = "sha256:50265d892689a5faefb84df80819d1ecef566eb3549cf915dfb33569359d1ce2"}, {file = "pytest_asyncio-0.23.8.tar.gz", hash = "sha256:759b10b33a6dc61cce40a8bd5205e302978bbbcc00e279a8b61d9a6a3c82e4d3"}, @@ -5002,6 +5381,8 @@ version = "3.6.1" description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytest_xdist-3.6.1-py3-none-any.whl", hash = "sha256:9ed4adfb68a016610848639bb7e02c9352d5d9f03d04809919e2dafc3be4cca7"}, {file = "pytest_xdist-3.6.1.tar.gz", hash = "sha256:ead156a4db231eec769737f57668ef58a2084a34b2e55c4a8fa20d861107300d"}, @@ -5023,10 +5404,12 @@ version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev", "docs"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"vertexai\" or extra == \"mistralai\" or extra == \"bedrock\" or extra == \"ranx\") and (extra == \"vertexai\" or extra == \"mistralai\" or extra == \"bedrock\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.dependencies] six = ">=1.5" @@ -5037,6 +5420,8 @@ version = "1.0.1" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"}, {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"}, @@ -5051,6 +5436,8 @@ version = "3.0.0" description = "Universally unique lexicographically sortable identifier" optional = false python-versions = ">=3.9" 
+groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "python_ulid-3.0.0-py3-none-any.whl", hash = "sha256:e4c4942ff50dbd79167ad01ac725ec58f924b4018025ce22c858bfcff99a5e31"}, {file = "python_ulid-3.0.0.tar.gz", hash = "sha256:e50296a47dc8209d28629a22fc81ca26c00982c78934bd7766377ba37ea49a9f"}, @@ -5065,6 +5452,8 @@ version = "2025.1" description = "World timezone definitions, modern and historical" optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"}, {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, @@ -5076,6 +5465,7 @@ version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["dev", "docs"] files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -5096,6 +5486,7 @@ files = [ {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"}, {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"}, ] +markers = {dev = "(python_version <= \"3.11\" or python_version >= \"3.12\") and sys_platform == \"win32\"", docs = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\" and (python_version <= \"3.11\" or python_version >= \"3.12\")"} [[package]] name = "pyyaml" @@ -5103,6 +5494,8 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -5165,6 +5558,8 @@ version = "26.2.1" description = "Python bindings for 0MQ" optional = false python-versions = ">=3.7" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:f39d1227e8256d19899d953e6e19ed2ccb689102e6d85e024da5acf410f301eb"}, {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a23948554c692df95daed595fdd3b76b420a4939d7a8a28d6d7dea9711878641"}, @@ -5286,6 +5681,8 @@ version = "0.3.20" description = "ranx: A Blazing-Fast Python Library for Ranking Evaluation, Comparison, and Fusion" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "ranx-0.3.20-py3-none-any.whl", hash = "sha256:e056e4d5981b0328b045868cc7064fc57a545f36009fbe9bb602295ec33335de"}, {file = "ranx-0.3.20.tar.gz", hash = "sha256:8afc6f2042c40645e5d1fd80c35ed75a885e18bd2db7e95cc7ec32a0b41e59ea"}, @@ -5312,6 +5709,8 @@ version = "5.2.1" description = 
"Python client for Redis database and key-value store" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "redis-5.2.1-py3-none-any.whl", hash = "sha256:ee7e1056b9aea0f04c6c2ed59452947f34c4940ee025f5dd83e6a6418b6989e4"}, {file = "redis-5.2.1.tar.gz", hash = "sha256:16f2e22dff21d5125e8481515e386711a34cbec50f0e44413dd7d9c060a54e0f"}, @@ -5330,6 +5729,8 @@ version = "0.36.2" description = "JSON Referencing + Python" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, @@ -5346,6 +5747,8 @@ version = "2024.11.6" description = "Alternative regular expression module, to replace re." optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ff590880083d60acc0433f9c3f713c51f7ac6ebb9adf889c79a261ecf541aa91"}, {file = "regex-2024.11.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:658f90550f38270639e83ce492f27d2c8d2cd63805c65a13a14d36ca126753f0"}, @@ -5449,10 +5852,12 @@ version = "2.32.3" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"voyageai\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"voyageai\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.dependencies] certifi = ">=2017.4.17" @@ -5470,6 +5875,8 @@ version = "13.9.4" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = true python-versions = ">=3.8.0" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, @@ -5489,6 +5896,8 @@ version = "0.23.1" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "rpds_py-0.23.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2a54027554ce9b129fc3d633c92fa33b30de9f08bc61b32c053dc9b537266fed"}, {file = "rpds_py-0.23.1-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:b5ef909a37e9738d146519657a1aab4584018746a18f71c692f2f22168ece40c"}, @@ -5601,6 +6010,8 @@ version = "4.9" description = "Pure-Python RSA implementation" optional = true python-versions = ">=3.6,<4" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, @@ -5615,6 +6026,8 @@ version = "0.11.3" description = "An Amazon S3 Transfer Manager" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"bedrock\"" files = [ {file = "s3transfer-0.11.3-py3-none-any.whl", hash = "sha256:ca855bdeb885174b5ffa95b9913622459d4ad8e331fc98eb01e6d5eb6a30655d"}, {file = "s3transfer-0.11.3.tar.gz", hash = "sha256:edae4977e3a122445660c7c114bba949f9d191bae3b34a096f18a1c8c354527a"}, @@ -5632,6 +6045,8 @@ version = "0.5.3" description = "" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"}, {file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"}, @@ -5669,6 +6084,8 @@ version = "1.6.1" description = "A set of python modules for machine learning and data mining" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "scikit_learn-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d056391530ccd1e501056160e3c9673b4da4805eb67eb2bdf4e983e1f9c9204e"}, {file = "scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0c8d036eb937dbb568c6242fa598d551d88fb4399c0344d95c001980ec1c7d36"}, @@ -5723,6 +6140,8 @@ version = "1.13.1" description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"sentence-transformers\" and python_version < \"3.10\"" files = [ {file = "scipy-1.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:20335853b85e9a49ff7572ab453794298bcf0354d8068c5f6775a0eabf350aca"}, {file = "scipy-1.13.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d605e9c23906d1994f55ace80e0125c587f96c020037ea6aa98d01b4bd2e222f"}, @@ -5765,6 +6184,8 @@ version = "1.15.2" description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.10" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"ranx\") and (python_version >= \"3.10\" or extra == \"sentence-transformers\")" files = [ {file = "scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9"}, {file = "scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5"}, @@ -5828,6 +6249,8 @@ version = "0.13.2" description = "Statistical data visualization" optional = true python-versions = ">=3.8" +groups = 
["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987"}, {file = "seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7"}, @@ -5849,6 +6272,8 @@ version = "3.4.1" description = "State-of-the-Art Text Embeddings" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "sentence_transformers-3.4.1-py3-none-any.whl", hash = "sha256:e026dc6d56801fd83f74ad29a30263f401b4b522165c19386d8bc10dcca805da"}, {file = "sentence_transformers-3.4.1.tar.gz", hash = "sha256:68daa57504ff548340e54ff117bd86c1d2f784b21e0fb2689cf3272b8937b24b"}, @@ -5876,6 +6301,8 @@ version = "75.8.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"sentence-transformers\" and python_version >= \"3.12\"" files = [ {file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"}, {file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"}, @@ -5896,6 +6323,8 @@ version = "2.0.7" description = "Manipulation and analysis of geometric objects" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"vertexai\"" files = [ {file = "shapely-2.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:33fb10e50b16113714ae40adccf7670379e9ccf5b7a41d0002046ba2b8f0f691"}, {file = "shapely-2.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f44eda8bd7a4bccb0f281264b34bf3518d8c4c9a8ffe69a1a05dabf6e8461147"}, @@ -5954,10 +6383,12 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev", "docs"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"vertexai\" or extra == \"mistralai\" or extra == \"bedrock\" or extra == \"ranx\") and (extra == \"vertexai\" or extra == \"mistralai\" or extra == \"bedrock\" or python_version >= \"3.10\")", dev = "python_version <= \"3.11\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "sniffio" @@ -5965,6 +6396,8 @@ version = "1.3.1" description = "Sniff out which async library your code is running under" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(extra == \"openai\" or extra == \"cohere\" or extra == \"mistralai\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -5976,6 +6409,8 @@ version = "2.2.0" description = "This package 
provides 29 stemmers for 28 languages generated from Snowball algorithms." optional = false python-versions = "*" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, @@ -5987,10 +6422,12 @@ version = "2.6" description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.8" +groups = ["main", "docs"] files = [ {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [[package]] name = "sphinx" @@ -5998,6 +6435,8 @@ version = "7.4.7" description = "Python documentation generator" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinx-7.4.7-py3-none-any.whl", hash = "sha256:c2419e2135d11f1951cd994d6eb18a1835bd8fdd8429f9ca375dc1f3281bd239"}, {file = "sphinx-7.4.7.tar.gz", hash = "sha256:242f92a7ea7e6c5b406fdc2615413890ba9f699114a9c09192d7dfead2ee9cfe"}, @@ -6034,6 +6473,8 @@ version = "0.5.2" description = "Add a copy button to each of your code cells." optional = false python-versions = ">=3.7" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd"}, {file = "sphinx_copybutton-0.5.2-py3-none-any.whl", hash = "sha256:fb543fd386d917746c9a2c50360c7905b605726b9355cd26e9974857afeae06e"}, @@ -6052,6 +6493,8 @@ version = "0.5.0" description = "A sphinx extension for designing beautiful, view size responsive web components." 
optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinx_design-0.5.0-py3-none-any.whl", hash = "sha256:1af1267b4cea2eedd6724614f19dcc88fe2e15aff65d06b2f6252cee9c4f4c1e"}, {file = "sphinx_design-0.5.0.tar.gz", hash = "sha256:e8e513acea6f92d15c6de3b34e954458f245b8e761b45b63950f65373352ab00"}, @@ -6075,6 +6518,8 @@ version = "1.0.1" description = "Sphinx Extension adding support for custom favicons" optional = false python-versions = ">=3.7" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinx-favicon-1.0.1.tar.gz", hash = "sha256:df796de32125609c1b4a8964db74270ebf4502089c27cd53f542354dc0b57e8e"}, {file = "sphinx_favicon-1.0.1-py3-none-any.whl", hash = "sha256:7c93d6b634cb4c9687ceab67a8526f05d3b02679df94e273e51a43282e6b034c"}, @@ -6094,6 +6539,8 @@ version = "2.0.0" description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib_applehelp-2.0.0-py3-none-any.whl", hash = "sha256:4cd3f0ec4ac5dd9c17ec65e9ab272c9b867ea77425228e68ecf08d6b28ddbdb5"}, {file = "sphinxcontrib_applehelp-2.0.0.tar.gz", hash = "sha256:2f29ef331735ce958efa4734873f084941970894c6090408b079c61b2e1c06d1"}, @@ -6110,6 +6557,8 @@ version = "2.0.0" description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib_devhelp-2.0.0-py3-none-any.whl", hash = "sha256:aefb8b83854e4b0998877524d1029fd3e6879210422ee3780459e28a1f03a8a2"}, {file = "sphinxcontrib_devhelp-2.0.0.tar.gz", hash = "sha256:411f5d96d445d1d73bb5d52133377b4248ec79db5c793ce7dbe59e074b4dd1ad"}, @@ -6126,6 +6575,8 @@ version = "2.1.0" description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib_htmlhelp-2.1.0-py3-none-any.whl", hash = "sha256:166759820b47002d22914d64a075ce08f4c46818e17cfc9470a9786b759b19f8"}, {file = "sphinxcontrib_htmlhelp-2.1.0.tar.gz", hash = "sha256:c9e2916ace8aad64cc13a0d233ee22317f2b9025b9cf3295249fa985cc7082e9"}, @@ -6142,6 +6593,8 @@ version = "1.0.1" description = "A sphinx extension which renders display math in HTML via JavaScript" optional = false python-versions = ">=3.5" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, @@ -6156,6 +6609,8 @@ version = "2.0.0" description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib_qthelp-2.0.0-py3-none-any.whl", hash = "sha256:b18a828cdba941ccd6ee8445dbe72ffa3ef8cbe7505d8cd1fa0d42d3f2d5f3eb"}, {file = "sphinxcontrib_qthelp-2.0.0.tar.gz", hash = 
"sha256:4fe7d0ac8fc171045be623aba3e2a8f613f8682731f9153bb2e40ece16b9bbab"}, @@ -6172,6 +6627,8 @@ version = "2.0.0" description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" optional = false python-versions = ">=3.9" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331"}, {file = "sphinxcontrib_serializinghtml-2.0.0.tar.gz", hash = "sha256:e9d912827f872c029017a53f0ef2180b327c3f7fd23c87229f7a8e8b70031d4d"}, @@ -6188,6 +6645,8 @@ version = "2.0.38" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "SQLAlchemy-2.0.38-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5e1d9e429028ce04f187a9f522818386c8b076723cdbe9345708384f49ebcec6"}, {file = "SQLAlchemy-2.0.38-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b87a90f14c68c925817423b0424381f0e16d80fc9a1a1046ef202ab25b19a444"}, @@ -6221,27 +6680,16 @@ files = [ {file = "SQLAlchemy-2.0.38-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5dba1cdb8f319084f5b00d41207b2079822aa8d6a4667c0f369fce85e34b0c86"}, {file = "SQLAlchemy-2.0.38-cp313-cp313-win32.whl", hash = "sha256:eae27ad7580529a427cfdd52c87abb2dfb15ce2b7a3e0fc29fbb63e2ed6f8120"}, {file = "SQLAlchemy-2.0.38-cp313-cp313-win_amd64.whl", hash = "sha256:b335a7c958bc945e10c522c069cd6e5804f4ff20f9a744dd38e748eb602cbbda"}, - {file = "SQLAlchemy-2.0.38-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:40310db77a55512a18827488e592965d3dec6a3f1e3d8af3f8243134029daca3"}, {file = "SQLAlchemy-2.0.38-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d3043375dd5bbcb2282894cbb12e6c559654c67b5fffb462fda815a55bf93f7"}, - {file = "SQLAlchemy-2.0.38-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70065dfabf023b155a9c2a18f573e47e6ca709b9e8619b2e04c54d5bcf193178"}, {file = "SQLAlchemy-2.0.38-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:c058b84c3b24812c859300f3b5abf300daa34df20d4d4f42e9652a4d1c48c8a4"}, - {file = "SQLAlchemy-2.0.38-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0398361acebb42975deb747a824b5188817d32b5c8f8aba767d51ad0cc7bb08d"}, {file = "SQLAlchemy-2.0.38-cp37-cp37m-win32.whl", hash = "sha256:a2bc4e49e8329f3283d99840c136ff2cd1a29e49b5624a46a290f04dff48e079"}, {file = "SQLAlchemy-2.0.38-cp37-cp37m-win_amd64.whl", hash = "sha256:9cd136184dd5f58892f24001cdce986f5d7e96059d004118d5410671579834a4"}, - {file = "SQLAlchemy-2.0.38-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:665255e7aae5f38237b3a6eae49d2358d83a59f39ac21036413fab5d1e810578"}, - {file = "SQLAlchemy-2.0.38-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:92f99f2623ff16bd4aaf786ccde759c1f676d39c7bf2855eb0b540e1ac4530c8"}, {file = "SQLAlchemy-2.0.38-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa498d1392216fae47eaf10c593e06c34476ced9549657fca713d0d1ba5f7248"}, - {file = "SQLAlchemy-2.0.38-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9afbc3909d0274d6ac8ec891e30210563b2c8bdd52ebbda14146354e7a69373"}, {file = "SQLAlchemy-2.0.38-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:57dd41ba32430cbcc812041d4de8d2ca4651aeefad2626921ae2a23deb8cd6ff"}, - {file = "SQLAlchemy-2.0.38-cp38-cp38-musllinux_1_2_x86_64.whl", 
hash = "sha256:3e35d5565b35b66905b79ca4ae85840a8d40d31e0b3e2990f2e7692071b179ca"}, {file = "SQLAlchemy-2.0.38-cp38-cp38-win32.whl", hash = "sha256:f0d3de936b192980209d7b5149e3c98977c3810d401482d05fb6d668d53c1c63"}, {file = "SQLAlchemy-2.0.38-cp38-cp38-win_amd64.whl", hash = "sha256:3868acb639c136d98107c9096303d2d8e5da2880f7706f9f8c06a7f961961149"}, - {file = "SQLAlchemy-2.0.38-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07258341402a718f166618470cde0c34e4cec85a39767dce4e24f61ba5e667ea"}, - {file = "SQLAlchemy-2.0.38-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a826f21848632add58bef4f755a33d45105d25656a0c849f2dc2df1c71f6f50"}, {file = "SQLAlchemy-2.0.38-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:386b7d136919bb66ced64d2228b92d66140de5fefb3c7df6bd79069a269a7b06"}, - {file = "SQLAlchemy-2.0.38-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f2951dc4b4f990a4b394d6b382accb33141d4d3bd3ef4e2b27287135d6bdd68"}, {file = "SQLAlchemy-2.0.38-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8bf312ed8ac096d674c6aa9131b249093c1b37c35db6a967daa4c84746bc1bc9"}, - {file = "SQLAlchemy-2.0.38-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:6db316d6e340f862ec059dc12e395d71f39746a20503b124edc255973977b728"}, {file = "SQLAlchemy-2.0.38-cp39-cp39-win32.whl", hash = "sha256:c09a6ea87658695e527104cf857c70f79f14e9484605e205217aae0ec27b45fc"}, {file = "SQLAlchemy-2.0.38-cp39-cp39-win_amd64.whl", hash = "sha256:12f5c9ed53334c3ce719155424dc5407aaa4f6cadeb09c5b627e06abb93933a1"}, {file = "SQLAlchemy-2.0.38-py3-none-any.whl", hash = "sha256:63178c675d4c80def39f1febd625a6333f44c0ba269edd8a468b156394b27753"}, @@ -6283,6 +6731,8 @@ version = "0.6.3" description = "Extract data from python stack frames and tracebacks for informative displays" optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, @@ -6302,6 +6752,8 @@ version = "1.13.1" description = "Computer algebra system (CAS) in Python" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, @@ -6319,6 +6771,8 @@ version = "0.9.0" description = "Pretty-print tabular data" optional = false python-versions = ">=3.7" +groups = ["main", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, @@ -6333,6 +6787,8 @@ version = "9.0.0" description = "Retry code until it succeeds" optional = false python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539"}, {file 
= "tenacity-9.0.0.tar.gz", hash = "sha256:807f37ca97d62aa361264d497b0e31e92b8027044942bfa756160d908320d73b"}, @@ -6348,6 +6804,8 @@ version = "4.9.1" description = "Python library for throwaway instances of anything that can run in a Docker container" optional = false python-versions = "<4.0,>=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "testcontainers-4.9.1-py3-none-any.whl", hash = "sha256:315fb94b42a383872df530aa45319745278ef0cc18b9cfcdc231a75d14afa5a0"}, {file = "testcontainers-4.9.1.tar.gz", hash = "sha256:37fe9a222549ddb788463935965b16f91809e9a8d654f437d6a59eac9b77f76f"}, @@ -6401,6 +6859,8 @@ version = "3.5.0" description = "threadpoolctl" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "threadpoolctl-3.5.0-py3-none-any.whl", hash = "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467"}, {file = "threadpoolctl-3.5.0.tar.gz", hash = "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107"}, @@ -6412,6 +6872,8 @@ version = "1.4.0" description = "A tiny CSS parser" optional = false python-versions = ">=3.8" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289"}, {file = "tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7"}, @@ -6430,6 +6892,8 @@ version = "0.21.0" description = "" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(extra == \"sentence-transformers\" or extra == \"cohere\") and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:3c4c93eae637e7d2aaae3d376f06085164e1660f89304c0ab2b1d08a406636b2"}, {file = "tokenizers-0.21.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:f53ea537c925422a2e0e92a24cce96f6bc5046bbef24a1652a5edc8ba975f62e"}, @@ -6462,6 +6926,8 @@ version = "2.2.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -6503,6 +6969,8 @@ version = "0.13.2" description = "Style preserving TOML library" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tomlkit-0.13.2-py3-none-any.whl", hash = "sha256:7a974427f6e119197f670fbbbeae7bef749a6c14e793db934baefc1b5f03efde"}, {file = "tomlkit-0.13.2.tar.gz", hash = "sha256:fff5fe59a87295b278abd31bec92c15d9bc4a06885ab12bcea52c71119392e79"}, @@ -6514,6 +6982,8 @@ version = "2.6.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = true python-versions = ">=3.9.0" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl", hash = 
"sha256:6860df13d9911ac158f4c44031609700e1eba07916fff62e21e6ffa0a9e01961"}, {file = "torch-2.6.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:c4f103a49830ce4c7561ef4434cc7926e5a5fe4e5eb100c19ab36ea1e2b634ab"}, @@ -6570,6 +7040,8 @@ version = "6.4.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, @@ -6590,6 +7062,8 @@ version = "4.67.1" description = "Fast, Extensible Progress Meter" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"openai\" or extra == \"ranx\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"openai\" or python_version >= \"3.10\")" files = [ {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, @@ -6611,6 +7085,8 @@ version = "5.14.3" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, @@ -6626,6 +7102,8 @@ version = "4.49.0" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = true python-versions = ">=3.9.0" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"sentence-transformers\"" files = [ {file = "transformers-4.49.0-py3-none-any.whl", hash = "sha256:6b4fded1c5fee04d384b1014495b4235a2b53c87503d7d592423c06128cbbe03"}, {file = "transformers-4.49.0.tar.gz", hash = "sha256:7e40e640b5b8dc3f48743f5f5adbdce3660c82baafbd3afdfc04143cdbd2089e"}, @@ -6695,6 +7173,8 @@ version = "2.6" description = "Support tools for TREC CAR participants. 
Also see trec-car.cs.unh.edu" optional = true python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "trec-car-tools-2.6.tar.gz", hash = "sha256:2fce2de120224fd569b151d5bed358a4ed334e643889b9e3dfe3e5a3d15d21c8"}, {file = "trec_car_tools-2.6-py3-none-any.whl", hash = "sha256:e6f0373259e1c234222da7270ab54ca7af7a6f8d0dd32b13e158c1659d3991cf"}, @@ -6710,6 +7190,8 @@ version = "3.2.0" description = "A language and compiler for custom Deep Learning operations" optional = true python-versions = "*" +groups = ["main"] +markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and extra == \"sentence-transformers\" and (python_version <= \"3.11\" or python_version >= \"3.12\")" files = [ {file = "triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3e54983cd51875855da7c68ec05c05cf8bb08df361b1d5b69e05e40b0c9bd62"}, {file = "triton-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8009a1fb093ee8546495e96731336a33fb8856a38e45bb4ab6affd6dbc3ba220"}, @@ -6729,6 +7211,8 @@ version = "1.16.0.20241221" description = "Typing stubs for cffi" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types_cffi-1.16.0.20241221-py3-none-any.whl", hash = "sha256:e5b76b4211d7a9185f6ab8d06a106d56c7eb80af7cdb8bfcb4186ade10fb112f"}, {file = "types_cffi-1.16.0.20241221.tar.gz", hash = "sha256:1c96649618f4b6145f58231acb976e0b448be6b847f7ab733dabe62dfbff6591"}, @@ -6743,6 +7227,8 @@ version = "24.1.0.20240722" description = "Typing stubs for pyOpenSSL" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types-pyOpenSSL-24.1.0.20240722.tar.gz", hash = "sha256:47913b4678a01d879f503a12044468221ed8576263c1540dcb0484ca21b08c39"}, {file = "types_pyOpenSSL-24.1.0.20240722-py3-none-any.whl", hash = "sha256:6a7a5d2ec042537934cfb4c9d4deb0e16c4c6250b09358df1f083682fe6fda54"}, @@ -6758,6 +7244,8 @@ version = "6.0.12.20241230" description = "Typing stubs for PyYAML" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types_PyYAML-6.0.12.20241230-py3-none-any.whl", hash = "sha256:fa4d32565219b68e6dee5f67534c722e53c00d1cfc09c435ef04d7353e1e96e6"}, {file = "types_pyyaml-6.0.12.20241230.tar.gz", hash = "sha256:7f07622dbd34bb9c8b264fe860a17e0efcad00d50b5f27e93984909d9363498c"}, @@ -6769,6 +7257,8 @@ version = "4.6.0.20241004" description = "Typing stubs for redis" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types-redis-4.6.0.20241004.tar.gz", hash = "sha256:5f17d2b3f9091ab75384153bfa276619ffa1cf6a38da60e10d5e6749cc5b902e"}, {file = "types_redis-4.6.0.20241004-py3-none-any.whl", hash = "sha256:ef5da68cb827e5f606c8f9c0b49eeee4c2669d6d97122f301d3a55dc6a63f6ed"}, @@ -6784,6 +7274,8 @@ version = "2.31.0.6" description = "Typing stubs for requests" optional = true python-versions = ">=3.7" +groups = ["main"] +markers = "python_version < \"3.10\" and extra == \"cohere\"" files = [ {file = "types-requests-2.31.0.6.tar.gz", hash = "sha256:cd74ce3b53c461f1228a9b783929ac73a666658f223e28ed29753771477b3bd0"}, {file = 
"types_requests-2.31.0.6-py3-none-any.whl", hash = "sha256:a2db9cb228a81da8348b49ad6db3f5519452dd20a9c1e1a868c83c5fe88fd1a9"}, @@ -6798,6 +7290,8 @@ version = "2.32.0.20241016" description = "Typing stubs for requests" optional = true python-versions = ">=3.8" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"cohere\" and python_version >= \"3.10\"" files = [ {file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"}, {file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"}, @@ -6812,6 +7306,8 @@ version = "75.8.0.20250225" description = "Typing stubs for setuptools" optional = false python-versions = ">=3.9" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types_setuptools-75.8.0.20250225-py3-none-any.whl", hash = "sha256:94c86b439cc60bcc68c1cda3fd2c301f007f8f9502f4fbb54c66cb5ce9b875af"}, {file = "types_setuptools-75.8.0.20250225.tar.gz", hash = "sha256:6038f7e983d55792a5f90d8fdbf5d4c186026214a16bb65dd6ae83c624ae9636"}, @@ -6823,6 +7319,8 @@ version = "0.9.0.20241207" description = "Typing stubs for tabulate" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "types_tabulate-0.9.0.20241207-py3-none-any.whl", hash = "sha256:b8dad1343c2a8ba5861c5441370c3e35908edd234ff036d4298708a1d4cf8a85"}, {file = "types_tabulate-0.9.0.20241207.tar.gz", hash = "sha256:ac1ac174750c0a385dfd248edc6279fa328aaf4ea317915ab879a2ec47833230"}, @@ -6834,6 +7332,8 @@ version = "1.26.25.14" description = "Typing stubs for urllib3" optional = true python-versions = "*" +groups = ["main"] +markers = "python_version < \"3.10\" and extra == \"cohere\"" files = [ {file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, @@ -6845,6 +7345,8 @@ version = "4.12.2" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -6856,6 +7358,8 @@ version = "0.9.0" description = "Runtime inspection utilities for typing module." 
optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"mistralai\"" files = [ {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, @@ -6871,6 +7375,8 @@ version = "2025.1" description = "Provider of IANA time zone data" optional = true python-versions = ">=2" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, @@ -6882,6 +7388,8 @@ version = "0.2.3" description = "Pure Python decompression module for .Z files compressed using Unix compress utility" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "unlzw3-0.2.3-py3-none-any.whl", hash = "sha256:7760fb4f3afa1225623944c061991d89a061f7fb78665dbc4cddfdb562bb4a8b"}, {file = "unlzw3-0.2.3.tar.gz", hash = "sha256:ede5d928c792fff9da406f20334f9739693327f448f383ae1df1774627197bbb"}, @@ -6897,10 +7405,12 @@ version = "1.26.20" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main", "dev", "docs"] files = [ {file = "urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e"}, {file = "urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32"}, ] +markers = {main = "extra == \"sentence-transformers\" and python_version < \"3.10\" or extra == \"cohere\" and python_version < \"3.10\" or extra == \"vertexai\" and python_version < \"3.10\" or extra == \"voyageai\" and python_version < \"3.10\" or extra == \"bedrock\" and python_version < \"3.10\"", dev = "python_version < \"3.10\"", docs = "python_version < \"3.10\""} [package.extras] brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] @@ -6913,10 +7423,12 @@ version = "2.3.0" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.9" +groups = ["main", "dev", "docs"] files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] +markers = {main = "(python_version <= \"3.11\" or python_version >= \"3.12\") and (extra == \"sentence-transformers\" or extra == \"cohere\" or extra == \"vertexai\" or extra == \"voyageai\" or extra == \"ranx\" or extra == \"bedrock\") and python_version >= \"3.10\"", dev = "python_version <= \"3.11\" and python_version >= \"3.10\" or python_version >= \"3.12\"", docs = "python_version <= \"3.11\" and python_version >= \"3.10\" or python_version >= \"3.12\""} [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] @@ -6930,6 +7442,8 @@ version = "20.29.2" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "virtualenv-20.29.2-py3-none-any.whl", hash = "sha256:febddfc3d1ea571bdb1dc0f98d7b45d24def7428214d4fb73cc486c9568cce6a"}, {file = "virtualenv-20.29.2.tar.gz", hash = "sha256:fdaabebf6d03b5ba83ae0a02cfe96f48a716f4fae556461d180825866f75b728"}, @@ -6950,6 +7464,8 @@ version = "0.2.4" description = "" optional = true python-versions = "<4.0.0,>=3.7.1" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "voyageai-0.2.4-py3-none-any.whl", hash = "sha256:e3070e5c78dec89adae43231334b4637aa88933dad99b1c33d3219fdfc94dfa4"}, {file = "voyageai-0.2.4.tar.gz", hash = "sha256:b9911d8629e8a4e363291c133482fead49a3536afdf1e735f3ab3aaccd8d250d"}, @@ -6968,6 +7484,8 @@ version = "0.2.5" description = "Python library to work with ARC and WARC files" optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "warc3_wet-0.2.5-py3-none-any.whl", hash = "sha256:5a9a525383fb1af159734baa75f349a7c4ec7bccd1b938681b5748515d2bf624"}, {file = "warc3_wet-0.2.5.tar.gz", hash = "sha256:15e50402dabaa1e95307f1e2a6169cfd5f137b70761d9f0b16a10aa6de227970"}, @@ -6979,6 +7497,8 @@ version = "0.2.5" description = "Python library to work with ARC and WARC files, with fixes for ClueWeb09" optional = true python-versions = "*" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "warc3-wet-clueweb09-0.2.5.tar.gz", hash = "sha256:3054bfc07da525d5967df8ca3175f78fa3f78514c82643f8c81fbca96300b836"}, ] @@ -6989,6 +7509,8 @@ version = "0.2.13" description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" +groups = ["dev", "docs"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, @@ -7000,6 +7522,8 @@ version = "0.5.1" description = "Character encoding aliases for legacy web content" optional = false python-versions = "*" +groups = ["docs"] +markers = "python_version <= \"3.11\" or python_version >= 
\"3.12\"" files = [ {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, @@ -7011,6 +7535,8 @@ version = "1.17.2" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3d57c572081fed831ad2d26fd430d565b76aa277ed1d30ff4d40670b1c0dd984"}, {file = "wrapt-1.17.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b5e251054542ae57ac7f3fba5d10bfff615b6c2fb09abeb37d2f1463f841ae22"}, @@ -7099,6 +7625,8 @@ version = "1.18.3" description = "Yet another URL library" optional = true python-versions = ">=3.9" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"voyageai\"" files = [ {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7df647e8edd71f000a5208fe6ff8c382a1de8edfbccdbbfe649d263de07d8c34"}, {file = "yarl-1.18.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c69697d3adff5aa4f874b19c0e4ed65180ceed6318ec856ebc423aa5850d84f7"}, @@ -7195,10 +7723,12 @@ version = "3.21.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.9" +groups = ["dev", "docs"] files = [ {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, ] +markers = {dev = "python_version < \"3.10\"", docs = "python_version <= \"3.11\" or python_version >= \"3.12\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] @@ -7214,6 +7744,8 @@ version = "0.1.9" description = "Low-level interface to the zlib library that enables capturing the decoding state" optional = true python-versions = ">=3.6" +groups = ["main"] +markers = "(python_version <= \"3.11\" or python_version >= \"3.12\") and extra == \"ranx\" and python_version >= \"3.10\"" files = [ {file = "zlib_state-0.1.9-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97f45d0f80e9d7070229ecb36112eea6a17dc40053449a9c613ef837d9cb66b4"}, {file = "zlib_state-0.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3564eaa130f2533b87b82d0e622cfb5c25acec123e7bfe38d39db9ce6349cb52"}, @@ -7256,6 +7788,6 @@ vertexai = ["google-cloud-aiplatform", "protobuf"] voyageai = ["voyageai"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<3.14" -content-hash = "8be5a998fb20c7b99f19af0112aa1c2c7e981f802c26e7a3bce08eeb61dfb741" +content-hash = "3bcbaaf402487a181810db22556d0207a555d5683984cc3af14a803974c8900e" diff --git a/pyproject.toml b/pyproject.toml index 2a897726..c5eae39a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,8 @@ tenacity = ">=8.2.2" tabulate = "^0.9.0" ml-dtypes = "^0.4.0" python-ulid = "^3.0.0" +jsonpath-ng = "^1.5.0" + openai = { version = "^1.13.0", optional = true } sentence-transformers = { version = "^3.4.0", optional = true } scipy = [ diff --git a/redisvl/schema/validation.py b/redisvl/schema/validation.py index c4ddd3e7..dce0095d 100644 --- a/redisvl/schema/validation.py +++ 
b/redisvl/schema/validation.py @@ -5,11 +5,9 @@ using dynamically generated Pydantic models. """ -import json -import re -import warnings -from typing import Any, Dict, List, Optional, Type, Union, cast +from typing import Any, Dict, List, Optional, Type, Union +from jsonpath_ng import parse as jsonpath_parse from pydantic import BaseModel, Field, field_validator from redisvl.schema import IndexSchema @@ -213,29 +211,32 @@ def _validate_vector(cls, value): def extract_from_json_path(obj: Dict[str, Any], path: str) -> Any: """ - Extract a value from a nested JSON object using a path. + Extract a value from a nested JSON object using a JSONPath expression. Args: obj: The object to extract values from - path: JSONPath-style path (e.g., $.field.subfield) + path: JSONPath expression (e.g., $.field.subfield, $.[*].name) Returns: The extracted value or None if not found - """ - # Handle JSONPath syntax (e.g., $.field.subfield) - if path.startswith("$."): - path_parts = path[2:].split(".") - else: - path_parts = path.split(".") - current = obj - for part in path_parts: - if isinstance(current, dict) and part in current: - current = current[part] - else: - return None - - return current + Notes: + This function uses the jsonpath-ng library for proper JSONPath parsing + and supports standard JSONPath syntax such as wildcards and array indexing + (filter expressions require the extended parser in jsonpath_ng.ext). + """ + # If path doesn't start with $, add it as per JSONPath spec + if not path.startswith("$"): + path = f"$.{path}" + + # Parse and find the JSONPath expression + jsonpath_expr = jsonpath_parse(path) + matches = jsonpath_expr.find(obj) + + # Return the first match value, or None if no matches + if matches: + return matches[0].value + return None def validate_object(schema: IndexSchema, obj: Dict[str, Any]) -> Dict[str, Any]: From 4827f61d65d6ad3e5b65ec9918836d888f026e0a Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Thu, 27 Mar 2025 15:36:19 -0400 Subject: [PATCH 07/11] hash the schema as the client side cache key --- redisvl/schema/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/redisvl/schema/validation.py index dce0095d..629b193b 100644 --- a/redisvl/schema/validation.py +++ b/redisvl/schema/validation.py @@ -5,6 +5,7 @@ using dynamically generated Pydantic models.
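As context for the `extract_from_json_path` rewrite shown above: a minimal, self-contained sketch of the behavior the new jsonpath-ng-based implementation is expected to have. The sample document and paths below are invented for illustration and do not come from the repo or its tests.

```python
from jsonpath_ng import parse as jsonpath_parse

doc = {"user": {"name": "john", "scores": {"math": 90, "bio": 85}}}

# Plain dotted access, equivalent to what the old split-on-"." logic handled
assert jsonpath_parse("$.user.name").find(doc)[0].value == "john"

# Wildcards, which the old implementation could not resolve
values = sorted(match.value for match in jsonpath_parse("$.user.scores.*").find(doc))
assert values == [85, 90]

# No match -> find() returns an empty list, which the helper maps to None
assert jsonpath_parse("$.user.missing").find(doc) == []
```

Because the helper prepends `$.` when the path does not start with `$`, callers can keep passing bare dotted paths like `user.name` and still get full JSONPath semantics.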
""" +import json from typing import Any, Dict, List, Optional, Type, Union from jsonpath_ng import parse as jsonpath_parse @@ -41,7 +42,7 @@ def get_model_for_schema(cls, schema: IndexSchema) -> Type[BaseModel]: A Pydantic model class that can validate data against the schema """ # Use schema identifier as cache key - cache_key = schema.index.name + cache_key = str(hash(json.dumps(schema.to_dict(), sort_keys=True).encode())) if cache_key not in cls._model_cache: cls._model_cache[cache_key] = cls._create_model(schema) From 51e6fc1216f84fd6d64433513ff82714fb7e3226 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 28 Mar 2025 17:16:39 -0400 Subject: [PATCH 08/11] use hf access tokens --- .github/workflows/test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f96fca62..15c05f43 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -133,6 +133,11 @@ jobs: with: credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }} + - name: Set HuggingFace token + run: | + mkdir -p ~/.huggingface + echo '{"token":"${{ secrets.HF_TOKEN }}"}' > ~/.huggingface/token + - name: Run tests if: matrix.connection == 'plain' && matrix.redis-version == 'latest' env: @@ -149,6 +154,7 @@ jobs: OPENAI_API_VERSION: ${{ secrets.OPENAI_API_VERSION }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | make test-all @@ -173,6 +179,7 @@ jobs: OPENAI_API_VERSION: ${{ secrets.OPENAI_API_VERSION }} AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | docker run -d --name redis -p 6379:6379 redis/redis-stack-server:latest make test-notebooks From 0379db5733d9965b7a757bfdbf47b03c0e613da0 Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 28 Mar 2025 18:20:12 -0400 Subject: [PATCH 09/11] make extension classes accept vectorizer kwargs --- redisvl/extensions/llmcache/semantic.py | 13 ++++++------- redisvl/extensions/router/semantic.py | 13 ++++++++++--- .../extensions/session_manager/semantic_session.py | 9 ++++++--- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/redisvl/extensions/llmcache/semantic.py b/redisvl/extensions/llmcache/semantic.py index 2b70ae09..6e15c43b 100644 --- a/redisvl/extensions/llmcache/semantic.py +++ b/redisvl/extensions/llmcache/semantic.py @@ -95,12 +95,8 @@ def __init__( } # Use the index name as the key prefix by default - if "prefix" in kwargs: - prefix = kwargs["prefix"] - else: - prefix = name - - dtype = kwargs.get("dtype") + prefix = kwargs.pop("prefix", name) + dtype = kwargs.pop("dtype", None) # Validate a provided vectorizer or set the default if vectorizer: @@ -111,7 +107,10 @@ def __init__( f"Provided dtype {dtype} does not match vectorizer dtype {vectorizer.dtype}" ) else: - vectorizer_kwargs = {"dtype": dtype} if dtype else {} + vectorizer_kwargs = kwargs + + if dtype: + vectorizer_kwargs.update(**{"dtype": dtype}) vectorizer = HFTextVectorizer( model="sentence-transformers/all-mpnet-base-v2", diff --git a/redisvl/extensions/router/semantic.py b/redisvl/extensions/router/semantic.py index c06789e1..be83b447 100644 --- a/redisvl/extensions/router/semantic.py +++ b/redisvl/extensions/router/semantic.py @@ -72,7 +72,7 @@ def __init__( connection_kwargs (Dict[str, Any]): The connection arguments for the redis client. Defaults to empty {}. 
""" - dtype = kwargs.get("dtype") + dtype = kwargs.pop("dtype", None) # Validate a provided vectorizer or set the default if vectorizer: @@ -83,8 +83,15 @@ def __init__( f"Provided dtype {dtype} does not match vectorizer dtype {vectorizer.dtype}" ) else: - vectorizer_kwargs = {"dtype": dtype} if dtype else {} - vectorizer = HFTextVectorizer(**vectorizer_kwargs) + vectorizer_kwargs = kwargs + + if dtype: + vectorizer_kwargs.update(**{"dtype": dtype}) + + vectorizer = HFTextVectorizer( + model="sentence-transformers/all-mpnet-base-v2", + **vectorizer_kwargs, + ) if routing_config is None: routing_config = RoutingConfig() diff --git a/redisvl/extensions/session_manager/semantic_session.py b/redisvl/extensions/session_manager/semantic_session.py index 1aa15315..9497d06c 100644 --- a/redisvl/extensions/session_manager/semantic_session.py +++ b/redisvl/extensions/session_manager/semantic_session.py @@ -71,7 +71,7 @@ def __init__( super().__init__(name, session_tag) prefix = prefix or name - dtype = kwargs.get("dtype") + dtype = kwargs.pop("dtype", None) # Validate a provided vectorizer or set the default if vectorizer: @@ -82,10 +82,13 @@ def __init__( f"Provided dtype {dtype} does not match vectorizer dtype {vectorizer.dtype}" ) else: - vectorizer_kwargs = {"dtype": dtype} if dtype else {} + vectorizer_kwargs = kwargs + + if dtype: + vectorizer_kwargs.update(**{"dtype": dtype}) vectorizer = HFTextVectorizer( - model="sentence-transformers/msmarco-distilbert-cos-v5", + model="sentence-transformers/all-mpnet-base-v2", **vectorizer_kwargs, ) From b5f378031e3200cbf0f5c31eb6cd750f36cc596d Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 28 Mar 2025 18:22:34 -0400 Subject: [PATCH 10/11] clean up tests a bit --- tests/{unit => integration}/test_cross_encoder_reranker.py | 0 tests/unit/test_utils.py | 3 --- tests/unit/test_validation.py | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) rename tests/{unit => integration}/test_cross_encoder_reranker.py (100%) diff --git a/tests/unit/test_cross_encoder_reranker.py b/tests/integration/test_cross_encoder_reranker.py similarity index 100% rename from tests/unit/test_cross_encoder_reranker.py rename to tests/integration/test_cross_encoder_reranker.py diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 83300d0c..af0cc192 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,6 +1,3 @@ -import importlib -import io -import logging import re import sys from functools import wraps diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py index ac67e810..68933938 100644 --- a/tests/unit/test_validation.py +++ b/tests/unit/test_validation.py @@ -9,7 +9,7 @@ """ import re -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import pytest From e406d76dc1fe7439765a2510b80206ec50d5700a Mon Sep 17 00:00:00 2001 From: Tyler Hutcherson Date: Fri, 28 Mar 2025 18:37:16 -0400 Subject: [PATCH 11/11] start centralizing the use of fixtures for hugging face models --- tests/conftest.py | 10 ++++++++++ tests/integration/test_threshold_optimizer.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 24da05e5..f0c3a435 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ from testcontainers.compose import DockerCompose from redisvl.redis.connection import RedisConnectionFactory +from redisvl.utils.vectorize import HFTextVectorizer @pytest.fixture(autouse=True) @@ 
-68,6 +69,15 @@ def client(redis_url): yield conn +@pytest.fixture(scope="session", autouse=True) +def hf_vectorizer(): + return HFTextVectorizer( + model="sentence-transformers/all-mpnet-base-v2", + token=os.getenv("HF_TOKEN"), + cache_folder=os.getenv("SENTENCE_TRANSFORMERS_HOME"), + ) + + @pytest.fixture def sample_datetimes(): return { diff --git a/tests/integration/test_threshold_optimizer.py b/tests/integration/test_threshold_optimizer.py index 44871901..b510b038 100644 --- a/tests/integration/test_threshold_optimizer.py +++ b/tests/integration/test_threshold_optimizer.py @@ -35,10 +35,11 @@ def routes(): @pytest.fixture -def semantic_router(client, routes): +def semantic_router(client, routes, hf_vectorizer): router = SemanticRouter( name="test-router", routes=routes, + vectorizer=hf_vectorizer, routing_config=RoutingConfig(max_k=2), redis_client=client, overwrite=False, @@ -86,7 +87,7 @@ def test_data_optimization(): def test_routes_different_distance_thresholds_optimizer_default( - semantic_router, routes, redis_url, test_data_optimization + semantic_router, routes, redis_url, test_data_optimization, hf_vectorizer ): redis_version = semantic_router._index.client.info()["redis_version"] if not compare_versions(redis_version, "7.0.0"): @@ -101,6 +102,7 @@ def test_routes_different_distance_thresholds_optimizer_default( router = SemanticRouter( name="test_routes_different_distance_optimizer", routes=routes, + vectorizer=hf_vectorizer, redis_url=redis_url, overwrite=True, ) @@ -119,7 +121,7 @@ def test_routes_different_distance_thresholds_optimizer_default( def test_routes_different_distance_thresholds_optimizer_precision( - semantic_router, routes, redis_url, test_data_optimization + semantic_router, routes, redis_url, test_data_optimization, hf_vectorizer ): redis_version = semantic_router._index.client.info()["redis_version"] @@ -135,6 +137,7 @@ def test_routes_different_distance_thresholds_optimizer_precision( router = SemanticRouter( name="test_routes_different_distance_optimizer", routes=routes, + vectorizer=hf_vectorizer, redis_url=redis_url, overwrite=True, ) @@ -155,7 +158,7 @@ def test_routes_different_distance_thresholds_optimizer_precision( def test_routes_different_distance_thresholds_optimizer_recall( - semantic_router, routes, redis_url, test_data_optimization + semantic_router, routes, redis_url, test_data_optimization, hf_vectorizer ): redis_version = semantic_router._index.client.info()["redis_version"] if not compare_versions(redis_version, "7.0.0"): @@ -170,6 +173,7 @@ def test_routes_different_distance_thresholds_optimizer_recall( router = SemanticRouter( name="test_routes_different_distance_optimizer", routes=routes, + vectorizer=hf_vectorizer, redis_url=redis_url, overwrite=True, )
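
[Reviewer note] With the session-scoped `hf_vectorizer` fixture introduced in this patch, tests share a single HFTextVectorizer instance instead of re-loading the sentence-transformers model per test, which pairs with the HF_TOKEN wiring added earlier in the series. A minimal sketch of how a test consumes the fixture, assuming `embed` returns a plain list of floats as it does in current RedisVL; the test module, name, and assertions are illustrative, not part of the patch:

```python
# Hypothetical test module, for illustration only.
import numpy as np


def test_hf_vectorizer_embedding_shape(hf_vectorizer):
    # pytest injects the session-scoped fixture from tests/conftest.py, so the
    # model is downloaded and loaded at most once per test session.
    embedding = hf_vectorizer.embed("hello world")

    # sentence-transformers/all-mpnet-base-v2 emits 768-dimensional vectors
    assert len(embedding) == 768
    assert not np.isnan(np.array(embedding, dtype=np.float32)).any()
```

Session scope trades test isolation for speed here; since the vectorizer holds no per-test state, sharing one instance across the run is safe.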