-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Snowflake Cortex destination : Bug fixes #38206
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
import copy | ||
import uuid | ||
from typing import Any, Iterable, Optional | ||
|
||
|
@@ -85,7 +86,7 @@ def _get_updated_catalog(self) -> ConfiguredAirbyteCatalog: | |
metadata -> metadata of the record | ||
embedding -> embedding of the document content | ||
""" | ||
updated_catalog = self.catalog | ||
updated_catalog = copy.deepcopy(self.catalog) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needed to avoid mutating the original catalog, since this method is called twice. |
||
# update each stream in the catalog | ||
for stream in updated_catalog.streams: | ||
# TO-DO: Revisit this - Clear existing properties, if anys, since we are not entirely sure what's in the configured catalog. | ||
|
@@ -144,7 +145,8 @@ def get_write_strategy(self, stream_name: str) -> WriteStrategy: | |
for stream in self.catalog.streams: | ||
if stream.stream.name == stream_name: | ||
if stream.destination_sync_mode == DestinationSyncMode.overwrite: | ||
return WriteStrategy.REPLACE | ||
# we will use append here since we will remove the existing records and add new ones. | ||
return WriteStrategy.APPEND | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For overwrite mode we delete the existing records first and then use APPEND in PyAirbyte, since data is sent in batches. |
||
if stream.destination_sync_mode == DestinationSyncMode.append: | ||
return WriteStrategy.APPEND | ||
if stream.destination_sync_mode == DestinationSyncMode.append_dedup: | ||
|
@@ -170,10 +172,22 @@ def index(self, document_chunks: Iterable[Any], namespace: str, stream: str): | |
cortex_processor.process_airbyte_messages(airbyte_messages, self.get_write_strategy(stream)) | ||
|
||
def delete(self, delete_ids: list[str], namespace: str, stream: str):
    """No-op: record deletion is not handled here.

    This hook exists for vector-store destinations. Overwrite syncs are
    handled by PyAirbyte itself (existing records are removed ahead of the
    sync and new ones appended), so there is nothing for this method to do.
    """
|
||
def pre_sync(self, catalog: "ConfiguredAirbyteCatalog") -> None:
    """
    Run before the sync starts.

    Deletes all existing records in the destination for every stream whose
    destination sync mode is ``overwrite``, so that the subsequent APPEND
    writes effectively replace the table contents.

    :param catalog: configured catalog describing the streams being synced.
    """
    # Hoist the case-insensitive table lookup out of the loop: the original
    # rebuilt the lowered table list once per stream. A set also gives O(1)
    # membership tests.
    existing_tables = {table.lower() for table in self.default_processor._get_tables_list()}
    for stream in catalog.streams:
        # Only overwrite-mode streams need their prior records removed.
        if stream.destination_sync_mode != DestinationSyncMode.overwrite:
            continue
        stream_name = stream.stream.name
        if stream_name.lower() in existing_tables:
            # NOTE(review): stream_name is interpolated directly into SQL.
            # Assumed to be a trusted identifier from the configured
            # catalog — confirm upstream validation.
            self.default_processor._execute_sql(f"DELETE FROM {stream_name}")
    # (removed the dead trailing `pass` from the original)
|
||
def check(self) -> Optional[str]: | ||
self.default_processor._get_tables_list() | ||
# TODO: check to see if vector type is available in snowflake instance |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
32 seemed too low in general; each batch calls PyAirbyte for a write once.