forked from airbytehq/airbyte
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🐛 Add a drop table hook to drop scd tables in case of overwrite sync (airbytehq#18015)
* Add a drop table hook to drop scd tables in case of overwrite sync
* Add an integration test for dropping SCD table on overwrite
* skip new test for Oracle and TiDB
* Add normalization run after initial reset
* Bump normalization version
- Loading branch information
Showing
13 changed files
with
446 additions
and
78 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
46 changes: 46 additions & 0 deletions
46
...ntegration_tests/resources/test_reset_scd_overwrite/data_input/test_drop_scd_catalog.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"stream": { | ||
"name": "stream_test_scd_drop", | ||
"json_schema": { | ||
"type": ["null", "object"], | ||
"properties": { | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"date": { | ||
"type": "string", | ||
"format": "date" | ||
}, | ||
"timestamp_col": { | ||
"type": "string", | ||
"format": "date-time" | ||
}, | ||
"datetime_to_string": { | ||
"type": "string", | ||
"format": "date-time", | ||
"airbyte_type": "timestamp_with_timezone" | ||
}, | ||
"string_to_dt": { | ||
"type": "string" | ||
}, | ||
"number_to_int": { | ||
"type": "number" | ||
}, | ||
"int_to_number": { | ||
"type": "integer" | ||
} | ||
} | ||
}, | ||
"supported_sync_modes": ["incremental"], | ||
"source_defined_cursor": true, | ||
"default_cursor_field": [] | ||
}, | ||
"sync_mode": "incremental", | ||
"cursor_field": ["date"], | ||
"destination_sync_mode": "append_dedup", | ||
"primary_key": [["id"]] | ||
} | ||
] | ||
} |
46 changes: 46 additions & 0 deletions
46
...ests/resources/test_reset_scd_overwrite/data_input/test_drop_scd_catalog_incremental.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"stream": { | ||
"name": "stream_test_scd_drop", | ||
"json_schema": { | ||
"type": ["null", "object"], | ||
"properties": { | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"date": { | ||
"type": "string", | ||
"format": "date" | ||
}, | ||
"timestamp_col": { | ||
"type": "string", | ||
"format": "date-time" | ||
}, | ||
"datetime_to_string": { | ||
"type": "string" | ||
}, | ||
"string_to_dt": { | ||
"type": "string", | ||
"format": "date-time", | ||
"airbyte_type": "timestamp_with_timezone" | ||
}, | ||
"number_to_int": { | ||
"type": "integer" | ||
}, | ||
"int_to_number": { | ||
"type": "number" | ||
} | ||
} | ||
}, | ||
"supported_sync_modes": ["incremental"], | ||
"source_defined_cursor": true, | ||
"default_cursor_field": [] | ||
}, | ||
"sync_mode": "incremental", | ||
"cursor_field": ["date"], | ||
"destination_sync_mode": "append_dedup", | ||
"primary_key": [["id"]] | ||
} | ||
] | ||
} |
46 changes: 46 additions & 0 deletions
46
...tion_tests/resources/test_reset_scd_overwrite/data_input/test_drop_scd_catalog_reset.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"stream": { | ||
"name": "stream_test_scd_drop", | ||
"json_schema": { | ||
"type": ["null", "object"], | ||
"properties": { | ||
"id": { | ||
"type": "integer" | ||
}, | ||
"date": { | ||
"type": "string", | ||
"format": "date" | ||
}, | ||
"timestamp_col": { | ||
"type": "string", | ||
"format": "date-time" | ||
}, | ||
"datetime_to_string": { | ||
"type": "string" | ||
}, | ||
"string_to_dt": { | ||
"type": "string", | ||
"format": "date-time", | ||
"airbyte_type": "timestamp_with_timezone" | ||
}, | ||
"number_to_int": { | ||
"type": "integer" | ||
}, | ||
"int_to_number": { | ||
"type": "number" | ||
} | ||
} | ||
}, | ||
"supported_sync_modes": ["incremental"], | ||
"source_defined_cursor": true, | ||
"default_cursor_field": [] | ||
}, | ||
"sync_mode": "incremental", | ||
"cursor_field": ["date"], | ||
"destination_sync_mode": "overwrite", | ||
"primary_key": [["id"]] | ||
} | ||
] | ||
} |
5 changes: 5 additions & 0 deletions
5
...ntegration_tests/resources/test_reset_scd_overwrite/data_input/test_drop_scd_messages.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637589000, "data": { "id": 1, "date": "2022-08-29", "timestamp_col": "2020-08-29T00:00:00.000000-0000", "datetime_to_string":"2022-10-01T01:04:04-04:00", "string_to_dt":"2022-11-01T02:03:04-07:00", "number_to_int": 1, "int_to_number": 10}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637689100, "data": { "id": 2, "date": "2022-08-30", "timestamp_col": "2020-08-30T00:00:00.000-00", "datetime_to_string":"2022-10-02T01:04:04-04:00", "string_to_dt":"2022-11-02T03:04:05-07:00", "number_to_int": 10, "int_to_number": 11}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637789200, "data": { "id": 3, "date": "2022-08-31", "timestamp_col": "2020-08-31T00:00:00+00", "datetime_to_string":"2022-10-03T01:04:04-04:00", "string_to_dt":"2022-11-03T03:04:06-07:00", "number_to_int": 11, "int_to_number": 12}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637889300, "data": { "id": 4, "date": "2022-09-01", "timestamp_col": "2020-08-31T00:00:00+0000", "datetime_to_string":"2022-10-04T01:04:04-04:00", "string_to_dt":"2022-11-04T03:04:07-07:00", "number_to_int": 111, "int_to_number": 133}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637989400, "data": { "id": 5, "date": "2022-09-02", "timestamp_col": "2020-09-01T00:00:00Z", "datetime_to_string":"2022-10-05T01:04:04-04:00", "string_to_dt":"2022-11-05T03:04:08-12:00", "number_to_int": 1010, "int_to_number": 1300}}} |
6 changes: 6 additions & 0 deletions
6
...sts/resources/test_reset_scd_overwrite/data_input/test_scd_reset_messages_incremental.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637589000, "data": { "id": 1, "date": "2022-08-29", "timestamp_col": "2020-08-29T00:00:00.000000-0000", "datetime_to_string":"2022-10-01T01:04:04-04:00", "string_to_dt":"2022-11-01T02:03:04-07:00", "number_to_int": 1, "int_to_number": 10}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637689100, "data": { "id": 2, "date": "2022-08-30", "timestamp_col": "2020-08-30T00:00:00.000-00", "datetime_to_string":"2022-10-02T01:04:04-04:00", "string_to_dt":"2022-11-02T03:04:05-07:00", "number_to_int": 10, "int_to_number": 11}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637789200, "data": { "id": 3, "date": "2022-08-31", "timestamp_col": "2020-08-31T00:00:00+00", "datetime_to_string":"2022-10-03T01:04:04-04:00", "string_to_dt":"2022-11-03T03:04:06-07:00", "number_to_int": 11, "int_to_number": 12}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637889300, "data": { "id": 4, "date": "2022-09-01", "timestamp_col": "2020-08-31T00:00:00+0000", "datetime_to_string":"2022-10-04T01:04:04-04:00", "string_to_dt":"2022-11-04T03:04:07-07:00", "number_to_int": 111, "int_to_number": 133}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637989400, "data": { "id": 5, "date": "2022-09-02", "timestamp_col": "2020-09-01T00:00:00Z", "datetime_to_string":"2022-10-05T01:04:04-04:00", "string_to_dt":"2022-11-05T03:04:08-12:00", "number_to_int": 1010, "int_to_number": 1300}}} | ||
{"type": "RECORD", "record": {"stream": "stream_test_scd_drop", "emitted_at": 1602637989400, "data": { "id": 6, "date": "2022-09-03", "timestamp_col": "2020-09-01T00:00:00Z", "datetime_to_string":"this is a string, not a datetime value", "string_to_dt":"2022-11-05T03:04:08-12:00", "number_to_int": 1010, "int_to_number": 1300.25}}} |
2 changes: 2 additions & 0 deletions
2
...sources/test_reset_scd_overwrite/dbt_test_config/dbt_data_tests/test_check_row_counts.sql
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{# Returns every row of test_scd_drop_row_counts whose actual row_count differs from its expected_count. #}
select *
from {{ ref('test_scd_drop_row_counts') }}
where row_count <> expected_count
161 changes: 161 additions & 0 deletions
161
airbyte-integrations/bases/base-normalization/integration_tests/test_drop_scd_overwrite.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
# | ||
# Copyright (c) 2022 Airbyte, Inc., all rights reserved. | ||
# | ||
|
||
import json | ||
import os | ||
import pathlib | ||
import shutil | ||
|
||
import pytest | ||
from integration_tests.dbt_integration_test import DbtIntegrationTest | ||
from integration_tests.utils import generate_dbt_models, run_destination_process, setup_test_dir | ||
from normalization import DestinationType | ||
|
||
# Set of temporary test folders: handed to setup_test_dir and to the clean-up call, then deleted in the module teardown. | ||
temporary_folders = set() | ||
# Single DbtIntegrationTest helper instance shared by the fixtures and tests of this module. | ||
dbt_test_utils = DbtIntegrationTest() | ||
|
||
|
||
@pytest.fixture(scope="module", autouse=True)
def before_all_tests(request):
    """
    Module-wide setup and teardown for the SCD reset-on-overwrite tests.

    Setup: selects the target schema, switches to the test directory, prepares the
    destination databases and prepends the local virtualenv's bin folder to PATH.
    Teardown (after the yield): cleans temporary tables, tears the databases down
    and deletes every folder collected in `temporary_folders`.
    """
    targets = dbt_test_utils.get_test_targets()
    # destinations that need to be cleaned once all tests of the module have run
    destinations_to_clean = list(filter(lambda destination: destination.value in targets, DestinationType))

    dbt_test_utils.set_target_schema("test_reset_scd_overwrite")
    dbt_test_utils.change_current_test_dir(request)
    dbt_test_utils.setup_db(targets)

    # resolved after the directory change above, since the path is relative
    venv_bin = os.path.abspath("../.venv/bin/")
    os.environ["PATH"] = f"{venv_bin}:{os.environ['PATH']}"

    yield

    dbt_test_utils.clean_tmp_tables(
        destination_type=destinations_to_clean,
        test_type="test_reset_scd_overwrite",
        tmp_folders=temporary_folders,
    )
    dbt_test_utils.tear_down_db()
    for tmp_folder in temporary_folders:
        print(f"Deleting temporary test folder {tmp_folder}")
        shutil.rmtree(tmp_folder, ignore_errors=True)
|
||
|
||
@pytest.fixture
def setup_test_path(request):
    """
    Per-test fixture: switch to the test directory, log the working directory and
    PATH, then change back to pytest's invocation directory once the test is over.
    """
    dbt_test_utils.change_current_test_dir(request)
    working_dir = pathlib.Path().absolute()
    print(f"Running from: {working_dir}")
    search_path = os.environ["PATH"]
    print(f"Current PATH is: {search_path}")
    yield
    # return to the directory pytest was launched from
    os.chdir(request.config.invocation_dir)
|
||
|
||
@pytest.mark.parametrize("destination_type", list(DestinationType))
def test_reset_scd_on_overwrite(destination_type: DestinationType, setup_test_path):
    """
    Check that SCD tables are reset when a stream is synced in overwrite mode.

    The test is skipped for destinations that are not listed in the test targets,
    and for Oracle and TiDB. Redshift runs in a randomly named schema; whatever
    schema was configured before the test is restored afterwards so the override
    does not leak into the other parametrized cases.

    :param destination_type: destination under test (one case per DestinationType member)
    :param setup_test_path: fixture that moves into the test directory for the duration of the test
    """
    if destination_type.value not in dbt_test_utils.get_test_targets():
        pytest.skip(f"Destinations {destination_type} is not in NORMALIZATION_TEST_TARGET env variable")

    if destination_type.value in [DestinationType.ORACLE.value, DestinationType.TIDB.value]:
        # Oracle and TiDB do not support incremental syncs with schema changes yet
        pytest.skip(f"{destination_type} does not support incremental sync with schema change yet")

    # Remember the schema BEFORE any per-destination override: capturing it after the
    # override would turn the restore in the `finally` clause below into a no-op.
    original_target_schema = dbt_test_utils.target_schema
    if destination_type.value == DestinationType.REDSHIFT.value:
        # set unique schema for Redshift test
        dbt_test_utils.set_target_schema(dbt_test_utils.generate_random_string("test_reset_scd_"))

    test_resource_name = "test_reset_scd_overwrite"

    try:
        # log the schema actually used by this run (the random one for Redshift)
        print(f"Testing resetting SCD tables on overwrite with {destination_type} in schema {dbt_test_utils.target_schema}")
        run_reset_scd_on_overwrite_test(destination_type, test_resource_name)
    finally:
        dbt_test_utils.set_target_schema(original_target_schema)
|
||
|
||
def run_reset_scd_on_overwrite_test(destination_type: DestinationType, test_resource_name: str):
    """
    Drive the full reset-on-overwrite scenario against one destination.

    Steps, in order:
      1. prepare the test folder, dbt profile and destination config;
      2. reset the destination with an overwrite copy of the original catalog and run dbt;
      3. sync the first batch of messages with the original catalog and run dbt;
      4. reset again with the catalog whose column types were changed and run dbt;
      5. sync the second batch of messages with the modified incremental catalog and run dbt.

    :param destination_type: destination to run the scenario against
    :param test_resource_name: name of the folder under `resources` holding the test inputs
    """
    test_root_dir = setup_test_dir(destination_type.value, temporary_folders)

    def data_input(file_name: str) -> str:
        # path of an input file shipped with the test resources
        return os.path.join("resources", test_resource_name, "data_input", file_name)

    def in_test_root(relative_path: str) -> str:
        # path of a file or folder inside the temporary test folder
        return os.path.join(test_root_dir, relative_path)

    def remove_folders(relative_paths) -> None:
        # delete the given folders of the test folder, ignoring the ones that do not exist
        for relative_path in relative_paths:
            shutil.rmtree(in_test_root(relative_path), ignore_errors=True)

    def generate_models(catalog_name: str) -> None:
        generate_dbt_models(destination_type, test_resource_name, test_root_dir, "models", catalog_name, dbt_test_utils)

    def run_destination(message_file: str, catalog_name: str):
        return run_destination_process(destination_type, test_root_dir, message_file, catalog_name, dbt_test_utils)

    # model folders that must be removed between dbt runs to avoid DBT compilation errors
    stale_models = (
        "models/generated/airbyte_incremental",
        "models/generated/airbyte_views",
        "models/generated/airbyte_ctes",
    )

    # Generate DBT profile yaml and start from an empty set of generated models
    destination_config = dbt_test_utils.generate_profile_yaml_file(destination_type, test_root_dir)
    remove_folders(("models/generated",))

    # Generate config file for the destination
    with open(in_test_root("destination_config.json"), "w") as config_handle:
        config_handle.write(json.dumps(destination_config))

    # make sure DBT dependencies are installed
    dbt_test_utils.dbt_check(destination_type, test_root_dir)

    # Generate catalog for an initial reset/cleanup (pre-test): the original catalog forced to overwrite mode
    original_catalog_file = data_input("test_drop_scd_catalog.json")
    dbt_test_utils.copy_replace(
        original_catalog_file,
        in_test_root("initial_reset_catalog.json"),
        pattern='"destination_sync_mode": ".*"',
        replace_value='"destination_sync_mode": "overwrite"',
    )

    # Force a reset in destination raw tables to remove any data left over from previous test runs
    assert run_destination("", "initial_reset_catalog.json")
    # Generate models and run dbt once on the freshly reset destination
    generate_models("test_drop_scd_catalog_reset.json")
    dbt_test_utils.dbt_run(destination_type, test_root_dir, force_full_refresh=True)
    remove_folders(stale_models + ("models/generated/airbyte_tables",))

    # Run the first sync to create raw tables in destinations
    dbt_test_utils.copy_replace(original_catalog_file, in_test_root("destination_catalog.json"))
    assert run_destination(data_input("test_drop_scd_messages.txt"), "destination_catalog.json")

    # Generate models from the original catalog and normalize data from the first sync
    generate_models("test_drop_scd_catalog.json")
    dbt_test_utils.dbt_run(destination_type, test_root_dir, force_full_refresh=True)
    remove_folders(stale_models)

    # Run a reset with the catalog whose schema was modified
    dbt_test_utils.copy_replace(data_input("test_drop_scd_catalog_reset.json"), in_test_root("reset_catalog.json"))
    assert run_destination("", "reset_catalog.json")

    # Run dbt process after reset to drop SCD table
    generate_models("test_drop_scd_catalog_reset.json")
    dbt_test_utils.dbt_run(destination_type, test_root_dir, force_full_refresh=True)
    remove_folders(stale_models)

    # Run another sync with modified catalog
    dbt_test_utils.copy_replace(data_input("test_drop_scd_catalog_incremental.json"), in_test_root("destination_catalog.json"))
    assert run_destination(data_input("test_scd_reset_messages_incremental.txt"), "destination_catalog.json")

    # Run dbt process
    # NOTE(review): models are generated from the reset (overwrite) catalog although the sync
    # above used the incremental catalog - confirm this is intended.
    generate_models("test_drop_scd_catalog_reset.json")
    dbt_test_utils.dbt_run(destination_type, test_root_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.