From 15707d026753a1cb1fcffa64de9e8b5f7a972ecf Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 00:57:08 +0530 Subject: [PATCH 01/21] Add tenant ID field to Weaviate indexing config --- .../destination-weaviate/destination_weaviate/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py index 6c580102e7c3..34e6521436d5 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py @@ -65,6 +65,7 @@ class WeaviateIndexingConfigModel(BaseModel): ) batch_size: int = Field(title="Batch Size", description="The number of records to send to Weaviate in each batch", default=128) text_field: str = Field(title="Text Field", description="The field in the object that contains the embedded text", default="text") + tenant_id: str = Field(title="Tenant ID", description="The tenant ID to use for the Weaviate cluster", default="") default_vectorizer: str = Field( title="Default Vectorizer", description="The vectorizer to use if new classes need to be created", From ec672685b96d6edcdd6817c2dcf8bbc4b21c43b7 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 00:57:27 +0530 Subject: [PATCH 02/21] Add multi-tenancy support and tenant assignment to WeaviateIndexer class --- .../destination_weaviate/indexer.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 45c54d54bfed..164f106203b1 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ 
b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -81,6 +81,9 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: self.client.schema.create_class( { "class": class_name, + "multiTenancyConfig": { + "enabled": True if self.config.tenant_id.strip() else False, + }, "vectorizer": self.config.default_vectorizer, "properties": [ { @@ -96,6 +99,12 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: } ) logging.info(f"Created class {class_name}") + if self.config.tenant_id.strip(): + self.client.schema.add_class_tenants( + class_name=class_name, # The class to which the tenants will be added + tenants=[weaviate.Tenant(name=self.config.tenant_id)], + ) + logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") else: self.has_record_id_metadata[class_name] = schema is not None and any( prop.get("name") == METADATA_RECORD_ID_FIELD for prop in schema.get("properties", {}) @@ -124,7 +133,12 @@ def index(self, document_chunks, namespace, stream): weaviate_object[self.config.text_field] = chunk.page_content object_id = str(uuid.uuid4()) class_name = self._stream_to_class_name(chunk.record.stream) - self.client.batch.add_data_object(weaviate_object, class_name, object_id, vector=chunk.embedding) + if self.config.tenant_id.strip(): + self.client.batch.add_data_object( + weaviate_object, class_name, object_id, vector=chunk.embedding, tenant=self.config.tenant_id + ) + else: + self.client.batch.add_data_object(weaviate_object, class_name, object_id, vector=chunk.embedding) self._flush() def _stream_to_class_name(self, stream_name: str) -> str: From 1ae1eda609921d0290fd02ab360247cf20abb062 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 01:29:41 +0530 Subject: [PATCH 03/21] Update tenant ID field in WeaviateIndexingConfigModel --- .../destination-weaviate/destination_weaviate/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py index 34e6521436d5..8f4019484e18 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py @@ -65,7 +65,7 @@ class WeaviateIndexingConfigModel(BaseModel): ) batch_size: int = Field(title="Batch Size", description="The number of records to send to Weaviate in each batch", default=128) text_field: str = Field(title="Text Field", description="The field in the object that contains the embedded text", default="text") - tenant_id: str = Field(title="Tenant ID", description="The tenant ID to use for the Weaviate cluster", default="") + tenant_id: str = Field(title="Tenant ID", description="The tenant ID to use for multi tenancy", default="") default_vectorizer: str = Field( title="Default Vectorizer", description="The vectorizer to use if new classes need to be created", From 3e2e78115460bf28c61fd7fda487ec0d33c0fea7 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 01:46:52 +0530 Subject: [PATCH 04/21] Refactor class creation in WeaviateIndexer --- .../destination_weaviate/indexer.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 164f106203b1..e33358332b2e 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -78,26 +78,24 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: self.client.schema.create_class(schema) logging.info(f"Recreated class {class_name}") elif class_name not in classes: - 
self.client.schema.create_class( - { - "class": class_name, - "multiTenancyConfig": { - "enabled": True if self.config.tenant_id.strip() else False, - }, - "vectorizer": self.config.default_vectorizer, - "properties": [ - { - # Record ID is used for bookkeeping, not for searching - "name": METADATA_RECORD_ID_FIELD, - "dataType": ["text"], - "description": "Record ID, used for bookkeeping.", - "indexFilterable": True, - "indexSearchable": False, - "tokenization": "field", - } - ], - } - ) + config = { + "class": class_name, + "vectorizer": self.config.default_vectorizer, + "properties": [ + { + # Record ID is used for bookkeeping, not for searching + "name": METADATA_RECORD_ID_FIELD, + "dataType": ["text"], + "description": "Record ID, used for bookkeeping.", + "indexFilterable": True, + "indexSearchable": False, + "tokenization": "field", + } + ], + } + if self.config.text_field.strip(): + config["multiTenancyConfig"] = {"enabled": True} + self.client.schema.create_class(config) logging.info(f"Created class {class_name}") if self.config.tenant_id.strip(): self.client.schema.add_class_tenants( From 1c02776100eeaf9e33b1e58d809e28f4f9a7bfdf Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 02:13:49 +0530 Subject: [PATCH 05/21] Add unit test for pre-sync with multi-tenancy enabled --- .../unit_tests/indexer_test.py | 62 ++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py index 043a4b6ea68c..3909bb3dd6ab 100644 --- a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py @@ -71,6 +71,30 @@ def test_pre_sync_that_creates_class(self, MockClient): } ) + @patch("destination_weaviate.indexer.weaviate.Client") + def 
test_pre_sync_that_creates_class_with_multi_tenancy_enabled(self, MockClient): + mock_client = Mock() + mock_client.schema.get.return_value = {"classes": []} + MockClient.return_value = mock_client + self.indexer.pre_sync(self.mock_catalog) + mock_client.schema.create_class.assert_called_with( + { + "class": "Test", + "multiTenancyConfig": {"enabled": True}, + "vectorizer": "none", + "properties": [ + { + "name": "_ab_record_id", + "dataType": ["text"], + "description": "Record ID, used for bookkeeping.", + "indexFilterable": True, + "indexSearchable": False, + "tokenization": "field", + } + ], + } + ) + @patch("destination_weaviate.indexer.weaviate.Client") def test_pre_sync_that_deletes(self, MockClient): mock_client = Mock() @@ -200,31 +224,39 @@ def test_index_flushes_batch_and_normalizes(self): page_content="some_content", embedding=[1, 2, 3], metadata={ - "someField": "some_value", "complex": {"a": [1, 2, 3]}, "UPPERCASE_NAME": "abc", "id": 12, "empty_list": [], - "referral Agency Name": "test1", - "123StartsWithNumber": "test2", - "special&*chars": "test3", - "with spaces": "test4", - "": "test5", - "_startsWithUnderscore": "test6", - "multiple spaces": "test7", - "SpecialCharacters!@#": "test8" - }, + "someField": "some_value", + "complex": {"a": [1, 2, 3]}, + "UPPERCASE_NAME": "abc", + "id": 12, + "empty_list": [], + "referral Agency Name": "test1", + "123StartsWithNumber": "test2", + "special&*chars": "test3", + "with spaces": "test4", + "": "test5", + "_startsWithUnderscore": "test6", + "multiple spaces": "test7", + "SpecialCharacters!@#": "test8", + }, record=AirbyteRecordMessage(stream="test", data={"someField": "some_value"}, emitted_at=0), ) self.indexer.index([mock_chunk], None, "test") mock_client.batch.add_data_object.assert_called_with( - {"someField": "some_value", "complex": '{"a": [1, 2, 3]}', "uPPERCASE_NAME": "abc", "text": "some_content", "raw_id": 12, - "referral_Agency_Name": "test1", + { + "someField": "some_value", + "complex": '{"a": 
[1, 2, 3]}', + "uPPERCASE_NAME": "abc", + "text": "some_content", + "raw_id": 12, + "referral_Agency_Name": "test1", "_123StartsWithNumber": "test2", "specialchars": "test3", "with_spaces": "test4", "_": "test5", "_startsWithUnderscore": "test6", "multiple__spaces": "test7", - "specialCharacters": "test8" - - }, + "specialCharacters": "test8", + }, "Test", ANY, vector=[1, 2, 3], From abc4846c24b5cbd90a42b8bcd9825efc7a1190e8 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 02:58:26 +0530 Subject: [PATCH 06/21] Fix multi-tenancy configuration in WeaviateIndexer --- .../destination-weaviate/destination_weaviate/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index e33358332b2e..8d08267f3c1e 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -93,7 +93,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: } ], } - if self.config.text_field.strip(): + if self.config.tenant_id.strip(): config["multiTenancyConfig"] = {"enabled": True} self.client.schema.create_class(config) logging.info(f"Created class {class_name}") From e59602bfe304c7b08d101b6d350d53319b0ec204 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Sat, 13 Jan 2024 03:07:14 +0530 Subject: [PATCH 07/21] Fix class creation and tenant addition in WeaviateIndexer --- .../destination-weaviate/destination_weaviate/indexer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 8d08267f3c1e..bef6180fbba4 100644 --- 
a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -95,8 +95,10 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: } if self.config.tenant_id.strip(): config["multiTenancyConfig"] = {"enabled": True} + self.client.schema.create_class(config) logging.info(f"Created class {class_name}") + if self.config.tenant_id.strip(): self.client.schema.add_class_tenants( class_name=class_name, # The class to which the tenants will be added From 60ce254752ae3a3688b46160bfa7f1f800bcac9c Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Mon, 15 Jan 2024 14:36:30 +0100 Subject: [PATCH 08/21] prepare release --- .../connectors/destination-weaviate/metadata.yaml | 2 +- docs/integrations/destinations/weaviate.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/metadata.yaml b/airbyte-integrations/connectors/destination-weaviate/metadata.yaml index f519260f42f7..29aeefaf0831 100644 --- a/airbyte-integrations/connectors/destination-weaviate/metadata.yaml +++ b/airbyte-integrations/connectors/destination-weaviate/metadata.yaml @@ -13,7 +13,7 @@ data: connectorSubtype: vectorstore connectorType: destination definitionId: 7b7d7a0d-954c-45a0-bcfc-39a634b97736 - dockerImageTag: 0.2.13 + dockerImageTag: 0.2.14 dockerRepository: airbyte/destination-weaviate documentationUrl: https://docs.airbyte.com/integrations/destinations/weaviate githubIssueLabel: destination-weaviate diff --git a/docs/integrations/destinations/weaviate.md b/docs/integrations/destinations/weaviate.md index ed98586015b0..fe8647f96619 100644 --- a/docs/integrations/destinations/weaviate.md +++ b/docs/integrations/destinations/weaviate.md @@ -83,6 +83,7 @@ As properties have to start will a lowercase letter in Weaviate and can't contai | Version | Date | Pull Request | Subject | | :------ | :--------- | 
:--------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------- | +| 0.2.14 | 2024-01-15 | [34229](https://github.com/airbytehq/airbyte/pull/34229) | Allow configuring tenant id | | 0.2.13 | 2023-12-11 | [33303](https://github.com/airbytehq/airbyte/pull/33303) | Fix bug with embedding special tokens | | 0.2.12 | 2023-12-07 | [33218](https://github.com/airbytehq/airbyte/pull/33218) | Normalize metadata field names | | 0.2.11 | 2023-12-01 | [32697](https://github.com/airbytehq/airbyte/pull/32697) | Allow omitting raw text | From 87cd86e7973c13d7bed705b097ec8a02d2eee541 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Mon, 15 Jan 2024 19:11:17 +0530 Subject: [PATCH 09/21] Refactor delete_objects method in WeaviateIndexer class --- .../destination_weaviate/indexer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index bef6180fbba4..859f055d0f17 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -114,10 +114,18 @@ def delete(self, delete_ids, namespace, stream): if len(delete_ids) > 0: class_name = self._stream_to_class_name(stream) if self.has_record_id_metadata[class_name]: - self.client.batch.delete_objects( - class_name=class_name, - where={"path": [METADATA_RECORD_ID_FIELD], "operator": "ContainsAny", "valueStringArray": delete_ids}, - ) + where_filter = {"path": [METADATA_RECORD_ID_FIELD], "operator": "ContainsAny", "valueStringArray": delete_ids} + if self.config.tenant_id.strip(): + self.client.batch.delete_objects( + class_name=class_name, + tenant=self.config.tenant_id, + where=where_filter, + 
) + else: + self.client.batch.delete_objects( + class_name=class_name, + where=where_filter, + ) def index(self, document_chunks, namespace, stream): if len(document_chunks) == 0: From cfd581968dc9cd054355c4dd97cd6df2ecf073c8 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Mon, 15 Jan 2024 19:28:43 +0530 Subject: [PATCH 10/21] Add test for deleting records by record ID with tenant ID --- .../destination-weaviate/unit_tests/indexer_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py index 3909bb3dd6ab..59c064da604a 100644 --- a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py @@ -128,6 +128,18 @@ def test_index_deletes_by_record_id(self): where={"path": ["_ab_record_id"], "operator": "ContainsAny", "valueStringArray": ["some_id", "some_other_id"]}, ) + def test_index_deletes_by_record_id_with_tenant_id(self): + mock_client = Mock() + self.indexer.client = mock_client + self.indexer.has_record_id_metadata = defaultdict(None) + self.indexer.has_record_id_metadata["Test"] = True + self.indexer.delete(["some_id", "some_other_id"], None, "test") + mock_client.batch.delete_objects.assert_called_with( + class_name="Test", + tenant="test_tenant", + where={"path": ["_ab_record_id"], "operator": "ContainsAny", "valueStringArray": ["some_id", "some_other_id"]}, + ) + @patch("destination_weaviate.indexer.weaviate.Client") def test_index_not_delete_no_metadata_field(self, MockClient): mock_client = Mock() From 0e4638b3114cad63f075382232812c55f6881f2d Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Tue, 16 Jan 2024 11:40:23 +0100 Subject: [PATCH 11/21] format --- .../integration_tests/spec.json | 384 ++++++++++-------- 1 file changed, 207 insertions(+), 177 deletions(-) diff --git 
a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json index 3923a8851c4d..69681ef6dc61 100644 --- a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json @@ -5,164 +5,6 @@ "description": "The configuration model for the Vector DB based destinations. This model is used to generate the UI for the destination configuration,\nas well as to provide type safety for the configuration passed to the destination.\n\nThe configuration model is composed of four parts:\n* Processing configuration\n* Embedding configuration\n* Indexing configuration\n* Advanced configuration\n\nProcessing, embedding and advanced configuration are provided by this base class, while the indexing configuration is provided by the destination connector in the sub class.", "type": "object", "properties": { - "processing": { - "title": "ProcessingConfigModel", - "type": "object", - "properties": { - "chunk_size": { - "title": "Chunk size", - "description": "Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)", - "minimum": 1, - "maximum": 8191, - "type": "integer" - }, - "chunk_overlap": { - "title": "Chunk overlap", - "description": "Size of overlap between chunks in tokens to store in vector store to better capture relevant context", - "default": 0, - "type": "integer" - }, - "text_fields": { - "title": "Text fields to embed", - "description": "List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. 
`user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array.", - "default": [], - "always_show": true, - "examples": ["text", "user.name", "users.*.name"], - "type": "array", - "items": { "type": "string" } - }, - "metadata_fields": { - "title": "Fields to store as metadata", - "description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.", - "default": [], - "always_show": true, - "examples": ["age", "user", "user.name"], - "type": "array", - "items": { "type": "string" } - }, - "field_name_mappings": { - "title": "Field name mappings", - "description": "List of fields to rename. 
Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.", - "default": [], - "type": "array", - "items": { - "title": "FieldNameMappingConfigModel", - "type": "object", - "properties": { - "from_field": { - "title": "From field name", - "description": "The field name in the source", - "type": "string" - }, - "to_field": { - "title": "To field name", - "description": "The field name to use in the destination", - "type": "string" - } - }, - "required": ["from_field", "to_field"] - } - }, - "text_splitter": { - "title": "Text splitter", - "description": "Split text fields into chunks based on the specified method.", - "type": "object", - "oneOf": [ - { - "title": "By Separator", - "type": "object", - "properties": { - "mode": { - "title": "Mode", - "default": "separator", - "const": "separator", - "enum": ["separator"], - "type": "string" - }, - "separators": { - "title": "Separators", - "description": "List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use \".\". To split by a newline, use \"\\n\".", - "default": ["\"\\n\\n\"", "\"\\n\"", "\" \"", "\"\""], - "type": "array", - "items": { "type": "string" } - }, - "keep_separator": { - "title": "Keep separator", - "description": "Whether to keep the separator in the resulting chunks", - "default": false, - "type": "boolean" - } - }, - "required": ["mode"], - "description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc." 
- }, - { - "title": "By Markdown header", - "type": "object", - "properties": { - "mode": { - "title": "Mode", - "default": "markdown", - "const": "markdown", - "enum": ["markdown"], - "type": "string" - }, - "split_level": { - "title": "Split level", - "description": "Level of markdown headers to split text fields by. Headings down to the specified level will be used as split points", - "default": 1, - "minimum": 1, - "maximum": 6, - "type": "integer" - } - }, - "required": ["mode"], - "description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk." - }, - { - "title": "By Programming Language", - "type": "object", - "properties": { - "mode": { - "title": "Mode", - "default": "code", - "const": "code", - "enum": ["code"], - "type": "string" - }, - "language": { - "title": "Language", - "description": "Split code in suitable places based on the programming language", - "enum": [ - "cpp", - "go", - "java", - "js", - "php", - "proto", - "python", - "rst", - "ruby", - "rust", - "scala", - "swift", - "markdown", - "latex", - "html", - "sol" - ], - "type": "string" - } - }, - "required": ["language", "mode"], - "description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks." - } - ] - } - }, - "required": ["chunk_size"], - "group": "processing" - }, "embedding": { "title": "Embedding", "description": "Embedding configuration", @@ -181,8 +23,8 @@ "type": "string" } }, - "required": ["mode"], - "description": "Do not calculate and pass embeddings to Weaviate. Suitable for clusters with configured vectorizers to calculate embeddings within Weaviate or for classes that should only support regular text search." + "description": "Do not calculate and pass embeddings to Weaviate. 
Suitable for clusters with configured vectorizers to calculate embeddings within Weaviate or for classes that should only support regular text search.", + "required": ["mode"] }, { "title": "Azure OpenAI", @@ -296,8 +138,8 @@ "type": "string" } }, - "required": ["mode"], - "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs." + "description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs.", + "required": ["mode"] }, { "title": "OpenAI-compatible", @@ -341,6 +183,177 @@ } ] }, + "processing": { + "title": "ProcessingConfigModel", + "type": "object", + "properties": { + "chunk_size": { + "title": "Chunk size", + "description": "Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)", + "maximum": 8191, + "minimum": 1, + "type": "integer" + }, + "chunk_overlap": { + "title": "Chunk overlap", + "description": "Size of overlap between chunks in tokens to store in vector store to better capture relevant context", + "default": 0, + "type": "integer" + }, + "text_fields": { + "title": "Text fields to embed", + "description": "List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. 
`users.*.name` will access all `names` fields in all entries of the `users` array.", + "default": [], + "always_show": true, + "examples": ["text", "user.name", "users.*.name"], + "type": "array", + "items": { + "type": "string" + } + }, + "metadata_fields": { + "title": "Fields to store as metadata", + "description": "List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.", + "default": [], + "always_show": true, + "examples": ["age", "user", "user.name"], + "type": "array", + "items": { + "type": "string" + } + }, + "text_splitter": { + "title": "Text splitter", + "description": "Split text fields into chunks based on the specified method.", + "type": "object", + "oneOf": [ + { + "title": "By Separator", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "separator", + "const": "separator", + "enum": ["separator"], + "type": "string" + }, + "separators": { + "title": "Separators", + "description": "List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use \".\". 
To split by a newline, use \"\\n\".", + "default": ["\"\\n\\n\"", "\"\\n\"", "\" \"", "\"\""], + "type": "array", + "items": { + "type": "string" + } + }, + "keep_separator": { + "title": "Keep separator", + "description": "Whether to keep the separator in the resulting chunks", + "default": false, + "type": "boolean" + } + }, + "description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc.", + "required": ["mode"] + }, + { + "title": "By Markdown header", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "markdown", + "const": "markdown", + "enum": ["markdown"], + "type": "string" + }, + "split_level": { + "title": "Split level", + "description": "Level of markdown headers to split text fields by. Headings down to the specified level will be used as split points", + "default": 1, + "minimum": 1, + "maximum": 6, + "type": "integer" + } + }, + "description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk.", + "required": ["mode"] + }, + { + "title": "By Programming Language", + "type": "object", + "properties": { + "mode": { + "title": "Mode", + "default": "code", + "const": "code", + "enum": ["code"], + "type": "string" + }, + "language": { + "title": "Language", + "description": "Split code in suitable places based on the programming language", + "enum": [ + "cpp", + "go", + "java", + "js", + "php", + "proto", + "python", + "rst", + "ruby", + "rust", + "scala", + "swift", + "markdown", + "latex", + "html", + "sol" + ], + "type": "string" + } + }, + "required": ["language", "mode"], + "description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks." 
+ } + ] + }, + "field_name_mappings": { + "title": "Field name mappings", + "description": "List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.", + "default": [], + "type": "array", + "items": { + "title": "FieldNameMappingConfigModel", + "type": "object", + "properties": { + "from_field": { + "title": "From field name", + "description": "The field name in the source", + "type": "string" + }, + "to_field": { + "title": "To field name", + "description": "The field name to use in the destination", + "type": "string" + } + }, + "required": ["from_field", "to_field"] + } + } + }, + "required": ["chunk_size"], + "group": "processing" + }, + "omit_raw_text": { + "title": "Do not store raw text", + "description": "Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.", + "default": false, + "group": "advanced", + "type": "boolean" + }, "indexing": { "title": "Indexing", "type": "object", @@ -419,8 +432,8 @@ "type": "string" } }, - "required": ["mode"], - "description": "Do not authenticate (suitable for locally running test clusters, do not use for clusters with public IP addresses)" + "description": "Do not authenticate (suitable for locally running test clusters, do not use for clusters with public IP addresses)", + "required": ["mode"] } ] }, @@ -436,6 +449,12 @@ "default": "text", "type": "string" }, + "tenant_id": { + "title": "Tenant ID", + "description": "The tenant ID to use for multi tenancy", + "default": "", + "type": "string" + }, "default_vectorizer": { "title": "Default Vectorizer", "description": "The vectorizer to use if new classes need to be created", @@ -457,14 +476,20 @@ "description": "Additional HTTP headers to send with every request.", "default": [], "examples": [ - { "header_key": 
"X-OpenAI-Api-Key", "value": "my-openai-api-key" } + { + "header_key": "X-OpenAI-Api-Key", + "value": "my-openai-api-key" + } ], "type": "array", "items": { "title": "Header", "type": "object", "properties": { - "header_key": { "title": "Header Key", "type": "string" }, + "header_key": { + "title": "Header Key", + "type": "string" + }, "value": { "title": "Header Value", "airbyte_secret": true, @@ -478,21 +503,26 @@ "required": ["host", "auth"], "group": "indexing", "description": "Indexing configuration" - }, - "omit_raw_text": { - "title": "Do not store raw text", - "description": "Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.", - "default": false, - "group": "advanced", - "type": "boolean" } }, "required": ["embedding", "processing", "indexing"], "groups": [ - { "id": "processing", "title": "Processing" }, - { "id": "embedding", "title": "Embedding" }, - { "id": "indexing", "title": "Indexing" }, - { "id": "advanced", "title": "Advanced" } + { + "id": "processing", + "title": "Processing" + }, + { + "id": "embedding", + "title": "Embedding" + }, + { + "id": "indexing", + "title": "Indexing" + }, + { + "id": "advanced", + "title": "Advanced" + } ] }, "supportsIncremental": true, From 68c67803bcd884ebe01bc1a94d41159a20c9f3ab Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Tue, 16 Jan 2024 11:46:11 +0100 Subject: [PATCH 12/21] fix tests --- .../connectors/destination-weaviate/unit_tests/indexer_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py index 59c064da604a..f9d0b2e95d09 100644 --- a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py +++ 
b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py @@ -74,6 +74,7 @@ def test_pre_sync_that_creates_class(self, MockClient): @patch("destination_weaviate.indexer.weaviate.Client") def test_pre_sync_that_creates_class_with_multi_tenancy_enabled(self, MockClient): mock_client = Mock() + self.config.tenant_id = "test_tenant" mock_client.schema.get.return_value = {"classes": []} MockClient.return_value = mock_client self.indexer.pre_sync(self.mock_catalog) @@ -130,6 +131,7 @@ def test_index_deletes_by_record_id(self): def test_index_deletes_by_record_id_with_tenant_id(self): mock_client = Mock() + self.config.tenant_id = "test_tenant" self.indexer.client = mock_client self.indexer.has_record_id_metadata = defaultdict(None) self.indexer.has_record_id_metadata["Test"] = True From eadd631fb6d6641042da53cfffc28d4697d9fd09 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Tue, 16 Jan 2024 11:59:11 +0100 Subject: [PATCH 13/21] make tenant id secret --- .../destination-weaviate/destination_weaviate/config.py | 2 +- .../connectors/destination-weaviate/integration_tests/spec.json | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py index 8f4019484e18..c4708d59ffc9 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/config.py @@ -65,7 +65,7 @@ class WeaviateIndexingConfigModel(BaseModel): ) batch_size: int = Field(title="Batch Size", description="The number of records to send to Weaviate in each batch", default=128) text_field: str = Field(title="Text Field", description="The field in the object that contains the embedded text", default="text") - tenant_id: str = Field(title="Tenant ID", description="The tenant ID to use for multi tenancy", 
default="") + tenant_id: str = Field(title="Tenant ID", description="The tenant ID to use for multi tenancy", airbyte_secret=True, default="") default_vectorizer: str = Field( title="Default Vectorizer", description="The vectorizer to use if new classes need to be created", diff --git a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json index 69681ef6dc61..a5db30c7213d 100644 --- a/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json +++ b/airbyte-integrations/connectors/destination-weaviate/integration_tests/spec.json @@ -452,6 +452,7 @@ "tenant_id": { "title": "Tenant ID", "description": "The tenant ID to use for multi tenancy", + "airbyte_secret": true, "default": "", "type": "string" }, From c50ddd927d373745b24aa6524186cc97d146218d Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Tue, 16 Jan 2024 12:03:17 +0100 Subject: [PATCH 14/21] improve documentation --- docs/integrations/destinations/weaviate.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/integrations/destinations/weaviate.md b/docs/integrations/destinations/weaviate.md index fe8647f96619..ff2248114d1d 100644 --- a/docs/integrations/destinations/weaviate.md +++ b/docs/integrations/destinations/weaviate.md @@ -79,6 +79,8 @@ You can also create the class in Weaviate in advance if you need more control ov As properties have to start will a lowercase letter in Weaviate and can't contain spaces or special characters. Field names might be updated during the loading process. The field names `id`, `_id` and `_additional` are reserved keywords in Weaviate, so they will be renamed to `raw_id`, `raw__id` and `raw_additional` respectively. +When using [multi-tenancy](https://weaviate.io/developers/weaviate/manage-data/multi-tenancy), the tenant id can be configured in the connector configuration. If not specified, multi-tenancy will be disabled. 
In case You want to index into an already created class, you need to make sure the class is created with multi-tenancy enabled and the tenant id matches the one configured in the connector. In case the class doesn't exist, it will be created with multi-tenancy properly configured. + ## Changelog | Version | Date | Pull Request | Subject | From 649d464ecfe5dc6f205a9d34cfd8f95a5dfb14ad Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Tue, 16 Jan 2024 18:44:00 +0530 Subject: [PATCH 15/21] Add tenant to class if it doesn't exist, otherwise log existing tenant --- .../destination_weaviate/indexer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 859f055d0f17..37c687170489 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -100,11 +100,14 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: logging.info(f"Created class {class_name}") if self.config.tenant_id.strip(): - self.client.schema.add_class_tenants( - class_name=class_name, # The class to which the tenants will be added - tenants=[weaviate.Tenant(name=self.config.tenant_id)], - ) - logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") + class_tenants = self.client.schema.get_class_tenants(class_name=class_name) + if class_tenants is not None and self.config.tenant_id not in [tenant.name for tenant in class_tenants]: + self.client.schema.add_class_tenants(class_name=class_name, tenants=[weaviate.Tenant(name=self.config.tenant_id)]) + + logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") + + else: + logging.info(f"Tenant {self.config.tenant_id} already exists in class {class_name}") else: 
self.has_record_id_metadata[class_name] = schema is not None and any( prop.get("name") == METADATA_RECORD_ID_FIELD for prop in schema.get("properties", {}) From 2928d6504e74e96f507025ce5cea96de069cac70 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Tue, 16 Jan 2024 18:47:01 +0530 Subject: [PATCH 16/21] Update Weaviate integration documentation --- docs/integrations/destinations/weaviate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/destinations/weaviate.md b/docs/integrations/destinations/weaviate.md index ff2248114d1d..1b2246786a59 100644 --- a/docs/integrations/destinations/weaviate.md +++ b/docs/integrations/destinations/weaviate.md @@ -79,7 +79,7 @@ You can also create the class in Weaviate in advance if you need more control ov As properties have to start will a lowercase letter in Weaviate and can't contain spaces or special characters. Field names might be updated during the loading process. The field names `id`, `_id` and `_additional` are reserved keywords in Weaviate, so they will be renamed to `raw_id`, `raw__id` and `raw_additional` respectively. -When using [multi-tenancy](https://weaviate.io/developers/weaviate/manage-data/multi-tenancy), the tenant id can be configured in the connector configuration. If not specified, multi-tenancy will be disabled. In case You want to index into an already created class, you need to make sure the class is created with multi-tenancy enabled and the tenant id matches the one configured in the connector. In case the class doesn't exist, it will be created with multi-tenancy properly configured. +When using [multi-tenancy](https://weaviate.io/developers/weaviate/manage-data/multi-tenancy), the tenant id can be configured in the connector configuration. If not specified, multi-tenancy will be disabled. 
In case You want to index into an already created class, you need to make sure the class is created with multi-tenancy enabled and the tenant id matches the one configured in the connector. In case the class doesn't exist, it will be created with multi-tenancy properly configured. If the class already exists but the tenant id is not associated with the class, the connector will automatically add the tenant id to the class. This allows you to configure multiple connections for different tenants on the same schema. ## Changelog From 8fafd9980725f146d256b7296f9705d7d0c85f28 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Tue, 16 Jan 2024 19:19:55 +0530 Subject: [PATCH 17/21] Refactor class creation in WeaviateIndexer to handle existing classes --- .../destination-weaviate/destination_weaviate/indexer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 37c687170489..e4fdbd71e555 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -96,8 +96,11 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: if self.config.tenant_id.strip(): config["multiTenancyConfig"] = {"enabled": True} - self.client.schema.create_class(config) - logging.info(f"Created class {class_name}") + try: + self.client.schema.create_class(config) + logging.info(f"Created class {class_name}") + except Exception as e: + logging.error(f"Failed to create class or class already exists {class_name}: {e}") if self.config.tenant_id.strip(): class_tenants = self.client.schema.get_class_tenants(class_name=class_name) From 922e644e36f0f4b7b13998671238da4035f451c8 Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Tue, 16 Jan 2024 22:08:49 +0530 Subject: [PATCH 18/21] Add 
tenant ID to class if not already present prior to deletion --- .../destination-weaviate/destination_weaviate/indexer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index e4fdbd71e555..1f50afcd86af 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -69,6 +69,14 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: self._create_client() classes = {c["class"]: c for c in self.client.schema.get().get("classes", [])} self.has_record_id_metadata = defaultdict(lambda: False) + + if self.config.tenant_id.strip(): + for class_name in classes.keys(): + class_tenants = self.client.schema.get_class_tenants(class_name=class_name) + if class_tenants is not None and self.config.tenant_id not in [tenant.name for tenant in class_tenants]: + self.client.schema.add_class_tenants(class_name=class_name, tenants=[weaviate.Tenant(name=self.config.tenant_id)]) + logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") + for stream in catalog.streams: class_name = self._stream_to_class_name(stream.stream.name) schema = classes[class_name] if class_name in classes else None From 573533679cda74fe74190c924677bb08e93fc8bf Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Tue, 16 Jan 2024 22:45:24 +0530 Subject: [PATCH 19/21] Refactor class creation in WeaviateIndexer --- .../destination-weaviate/destination_weaviate/indexer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 1f50afcd86af..0ed475dffb10 100644 --- 
a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -104,11 +104,8 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: if self.config.tenant_id.strip(): config["multiTenancyConfig"] = {"enabled": True} - try: - self.client.schema.create_class(config) - logging.info(f"Created class {class_name}") - except Exception as e: - logging.error(f"Failed to create class or class already exists {class_name}: {e}") + self.client.schema.create_class(config) + logging.info(f"Created class {class_name}") if self.config.tenant_id.strip(): class_tenants = self.client.schema.get_class_tenants(class_name=class_name) From d3a0ab689e6df9f0b4173e92d50187c4569df04b Mon Sep 17 00:00:00 2001 From: Marcus0086 Date: Wed, 17 Jan 2024 11:48:54 +0530 Subject: [PATCH 20/21] (refactor) Add tenant to class if missing --- .../destination_weaviate/indexer.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py index 0ed475dffb10..93adb9d825a4 100644 --- a/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py +++ b/airbyte-integrations/connectors/destination-weaviate/destination_weaviate/indexer.py @@ -52,6 +52,14 @@ def _create_client(self): batch_size=None, dynamic=False, weaviate_error_retries=weaviate.WeaviateErrorRetryConf(number_retries=5) ) + def _add_tenant_to_class_if_missing(self, class_name: str): + class_tenants = self.client.schema.get_class_tenants(class_name=class_name) + if class_tenants is not None and self.config.tenant_id not in [tenant.name for tenant in class_tenants]: + self.client.schema.add_class_tenants(class_name=class_name, tenants=[weaviate.Tenant(name=self.config.tenant_id)]) + logging.info(f"Added 
tenant {self.config.tenant_id} to class {class_name}") + else: + logging.info(f"Tenant {self.config.tenant_id} already exists in class {class_name}") + def check(self) -> Optional[str]: deployment_mode = os.environ.get("DEPLOYMENT_MODE", "") if deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE and not self._uses_safe_config(): @@ -72,10 +80,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: if self.config.tenant_id.strip(): for class_name in classes.keys(): - class_tenants = self.client.schema.get_class_tenants(class_name=class_name) - if class_tenants is not None and self.config.tenant_id not in [tenant.name for tenant in class_tenants]: - self.client.schema.add_class_tenants(class_name=class_name, tenants=[weaviate.Tenant(name=self.config.tenant_id)]) - logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") + self._add_tenant_to_class_if_missing(class_name) for stream in catalog.streams: class_name = self._stream_to_class_name(stream.stream.name) @@ -108,14 +113,7 @@ def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None: logging.info(f"Created class {class_name}") if self.config.tenant_id.strip(): - class_tenants = self.client.schema.get_class_tenants(class_name=class_name) - if class_tenants is not None and self.config.tenant_id not in [tenant.name for tenant in class_tenants]: - self.client.schema.add_class_tenants(class_name=class_name, tenants=[weaviate.Tenant(name=self.config.tenant_id)]) - - logging.info(f"Added tenant {self.config.tenant_id} to class {class_name}") - - else: - logging.info(f"Tenant {self.config.tenant_id} already exists in class {class_name}") + self._add_tenant_to_class_if_missing(class_name) else: self.has_record_id_metadata[class_name] = schema is not None and any( prop.get("name") == METADATA_RECORD_ID_FIELD for prop in schema.get("properties", {}) From 688f0d2e81f5e581d0dc4c5aa25773154cb22ea1 Mon Sep 17 00:00:00 2001 From: Joe Reuter Date: Wed, 17 Jan 2024 11:31:52 +0100 Subject: [PATCH 
21/21] adjust documentation --- .../connectors/destination-weaviate/unit_tests/indexer_test.py | 1 + docs/integrations/destinations/weaviate.md | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py index f9d0b2e95d09..a5b2526e392c 100644 --- a/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py +++ b/airbyte-integrations/connectors/destination-weaviate/unit_tests/indexer_test.py @@ -75,6 +75,7 @@ def test_pre_sync_that_creates_class(self, MockClient): def test_pre_sync_that_creates_class_with_multi_tenancy_enabled(self, MockClient): mock_client = Mock() self.config.tenant_id = "test_tenant" + mock_client.schema.get_class_tenants.return_value = [] mock_client.schema.get.return_value = {"classes": []} MockClient.return_value = mock_client self.indexer.pre_sync(self.mock_catalog) diff --git a/docs/integrations/destinations/weaviate.md b/docs/integrations/destinations/weaviate.md index 1b2246786a59..c8acb02b50de 100644 --- a/docs/integrations/destinations/weaviate.md +++ b/docs/integrations/destinations/weaviate.md @@ -79,7 +79,7 @@ You can also create the class in Weaviate in advance if you need more control ov As properties have to start will a lowercase letter in Weaviate and can't contain spaces or special characters. Field names might be updated during the loading process. The field names `id`, `_id` and `_additional` are reserved keywords in Weaviate, so they will be renamed to `raw_id`, `raw__id` and `raw_additional` respectively. -When using [multi-tenancy](https://weaviate.io/developers/weaviate/manage-data/multi-tenancy), the tenant id can be configured in the connector configuration. If not specified, multi-tenancy will be disabled. 
In case You want to index into an already created class, you need to make sure the class is created with multi-tenancy enabled and the tenant id matches the one configured in the connector. In case the class doesn't exist, it will be created with multi-tenancy properly configured. If the class already exists but the tenant id is not associated with the class, the connector will automatically add the tenant id to the class. This allows you to configure multiple connections for different tenants on the same schema. +When using [multi-tenancy](https://weaviate.io/developers/weaviate/manage-data/multi-tenancy), the tenant id can be configured in the connector configuration. If not specified, multi-tenancy will be disabled. In case you want to index into an already created class, you need to make sure the class is created with multi-tenancy enabled. In case the class doesn't exist, it will be created with multi-tenancy properly configured. If the class already exists but the tenant id is not associated with the class, the connector will automatically add the tenant id to the class. This allows you to configure multiple connections for different tenants on the same schema. ## Changelog