From 09ebb47b2442dc11c14519b54ac1a81eef712a02 Mon Sep 17 00:00:00 2001 From: Catherine Noll Date: Tue, 1 Aug 2023 21:47:58 -0400 Subject: [PATCH] File cdk parser and cursor updates (#28900) * File-based CDK: update parquet parser to handle partitions * File-based CDK: make the record output & cursor date time format consistent --- .../models/declarative_component_schema.py | 2 +- .../file_based/file_types/parquet_parser.py | 36 ++++-- .../cursor/default_file_based_cursor.py | 16 ++- .../stream/default_file_based_stream.py | 3 +- .../file_based/scenarios/avro_scenarios.py | 40 +++--- .../file_based/scenarios/csv_scenarios.py | 80 ++++++------ .../scenarios/incremental_scenarios.py | 117 +++++++++++------- .../file_based/scenarios/jsonl_scenarios.py | 62 +++++----- .../file_based/scenarios/parquet_scenarios.py | 22 ++-- .../scenarios/user_input_schema_scenarios.py | 40 +++--- .../scenarios/validation_policy_scenarios.py | 116 ++++++++--------- .../stream/test_default_file_based_cursor.py | 39 +++--- 12 files changed, 313 insertions(+), 260 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py index 66419308e254..11ffb81cfb40 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/models/declarative_component_schema.py @@ -823,7 +823,7 @@ class DatetimeBasedCursor(BaseModel): cursor_datetime_formats: Optional[List[str]] = Field( None, description="The possible formats for the cursor field", - title="Cursor Datetime Format", + title="Cursor Datetime Formats", ) cursor_granularity: Optional[str] = Field( None, diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py index 40d5dee0fed8..09d66a1f25b6 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/file_types/parquet_parser.py @@ -4,7 +4,9 @@ import json import logging -from typing import Any, Dict, Iterable, Mapping +import os +from typing import Any, Dict, Iterable, List, Mapping +from urllib.parse import unquote import pyarrow as pa import pyarrow.parquet as pq @@ -27,11 +29,16 @@ async def infer_schema( if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") - # Pyarrow can detect the schema of a parquet file by reading only its metadata. - # https://github.com/apache/arrow/blob/main/python/pyarrow/_parquet.pyx#L1168-L1243 - parquet_file = pq.ParquetFile(stream_reader.open_file(file, self.file_read_mode, logger)) - parquet_schema = parquet_file.schema_arrow + with stream_reader.open_file(file, self.file_read_mode, logger) as fp: + parquet_file = pq.ParquetFile(fp) + parquet_schema = parquet_file.schema_arrow + + # Inferred non-partition schema schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema} + # Inferred partition schema + partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)} + + schema.update(partition_columns) return schema def parse_records( @@ -45,13 +52,18 @@ def parse_records( if not isinstance(parquet_format, ParquetFormat): raise ValueError(f"Expected ParquetFormat, got {parquet_format}") # FIXME test this branch! with stream_reader.open_file(file, self.file_read_mode, logger) as fp: - table = pq.read_table(fp) - for batch in table.to_batches(): - for i in range(batch.num_rows): - row_dict = { - column: ParquetParser._to_output_value(batch.column(column)[i], parquet_format) for column in table.column_names - } - yield row_dict + reader = pq.ParquetFile(fp) + partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)} + for row_group in range(reader.num_row_groups): + batch_dict = reader.read_row_group(row_group).to_pydict() + for record_values in zip(*batch_dict.values()): + record = dict(zip(batch_dict.keys(), record_values)) + record.update(partition_columns) + yield record + + @staticmethod + def _extract_partitions(filepath: str) -> List[str]: + return [unquote(partition) for partition in filepath.split(os.sep) if "=" in partition] @property def file_read_mode(self) -> FileReadMode: diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py index 9d4ab4047c55..eb672e9e42ff 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py @@ -48,11 +48,21 @@ def add_file(self, file: RemoteFile) -> None: ) def get_state(self) -> StreamState: - state = { - "history": self._file_to_datetime_history, - } + state = {"history": self._file_to_datetime_history, "_ab_source_file_last_modified": self._get_cursor()} return state + def _get_cursor(self) -> Optional[str]: + """ + Returns the cursor value. + + Files are synced in order of last-modified with secondary sort on filename, so the cursor value is + a string joining the last-modified timestamp of the last synced file and the name of the file. + """ + if self._file_to_datetime_history.items(): + filename, timestamp = max(self._file_to_datetime_history.items(), key=lambda x: (x[1], x[0])) + return f"{timestamp}_{filename}" + return None + def _is_history_full(self) -> bool: """ Returns true if the state's history is full, meaning new entries will start to replace old entries. diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py index 1486a083e52d..e31d841d6f7a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py @@ -34,6 +34,7 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin): The default file-based stream. """ + DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" ab_last_mod_col = "_ab_source_file_last_modified" ab_file_name_col = "_ab_source_file_url" airbyte_columns = [ab_last_mod_col, ab_file_name_col] @@ -78,7 +79,7 @@ def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping parser = self.get_parser(self.config.file_type) for file in stream_slice["files"]: # only serialize the datetime once - file_datetime_string = file.last_modified.strftime("%Y-%m-%dT%H:%M:%SZ") + file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT) n_skipped = line_no = 0 try: diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py index b13672649f3e..a0602bc7bc60 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/avro_scenarios.py @@ -219,7 +219,7 @@ "data": { "col1": "val11", "col2": 12, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -228,7 +228,7 @@ "data": { "col1": "val21", "col2": 22, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -282,7 +282,7 @@ "col_double": "20.02", "col_string": "Robbers", "col_album": {"album": "The 1975"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -292,7 +292,7 @@ "col_double": "20.23", "col_string": "Somebody Else", "col_album": {"album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -302,7 +302,7 @@ "col_double": "1975.1975", "col_string": "It's Not Living (If It's Not with You)", "col_song": {"title": "Love It If We Made It"}, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.avro", }, "stream": "stream1", @@ -312,7 +312,7 @@ "col_double": "5791.5791", "col_string": "The 1975", "col_song": {"title": "About You"}, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.avro", }, "stream": "stream1", @@ -407,7 +407,7 @@ "col_timestamp_micros": "2022-05-30T00:00:00.456789+00:00", "col_local_timestamp_millis": "2022-05-29T00:00:00.456000", "col_local_timestamp_micros": "2022-05-30T00:00:00.456789", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -488,7 +488,7 @@ "col_album": "A_MOMENT_APART", "col_year": 2017, "col_vocals": False, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "odesza_songs.avro", }, "stream": "songs_stream", @@ -499,7 +499,7 @@ "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "odesza_songs.avro", }, "stream": "songs_stream", @@ -510,7 +510,7 @@ "col_album": "THE_LAST_GOODBYE", "col_year": 2022, "col_vocals": True, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "odesza_songs.avro", }, "stream": "songs_stream", @@ -521,7 +521,7 @@ "col_album": "SUMMERS_GONE", "col_year": 2012, "col_vocals": True, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "odesza_songs.avro", }, "stream": "songs_stream", @@ -532,7 +532,7 @@ "col_album": "IN_RETURN", "col_year": 2014, "col_vocals": True, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "odesza_songs.avro", }, "stream": "songs_stream", @@ -542,7 +542,7 @@ "col_name": "Coachella", "col_location": {"country": "USA", "state": "California", "city": "Indio"}, "col_attendance": 250000, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "california_festivals.avro", }, "stream": "festivals_stream", @@ -552,7 +552,7 @@ "col_name": "CRSSD", "col_location": {"country": "USA", "state": "California", "city": "San Diego"}, "col_attendance": 30000, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "california_festivals.avro", }, "stream": "festivals_stream", @@ -562,7 +562,7 @@ "col_name": "Lightning in a Bottle", "col_location": {"country": "USA", "state": "California", "city": "Buena Vista Lake"}, "col_attendance": 18000, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "california_festivals.avro", }, "stream": "festivals_stream", @@ -572,7 +572,7 @@ "col_name": "Outside Lands", "col_location": {"country": "USA", "state": "California", "city": "San Francisco"}, "col_attendance": 220000, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "california_festivals.avro", }, "stream": "festivals_stream", @@ -653,7 +653,7 @@ "col_double": 20.02, "col_string": "Robbers", "col_album": {"album": "The 1975"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -663,7 +663,7 @@ "col_double": 20.23, "col_string": "Somebody Else", "col_album": {"album": "I Like It When You Sleep, for You Are So Beautiful yet So Unaware of It"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.avro", }, "stream": "stream1", @@ -673,7 +673,7 @@ "col_double": 1975.1975, "col_string": "It's Not Living (If It's Not with You)", "col_song": {"title": "Love It If We Made It"}, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.avro", }, "stream": "stream1", @@ -683,7 +683,7 @@ "col_double": 5791.5791, "col_string": "The 1975", "col_song": {"title": "About You"}, - "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.avro", }, "stream": "stream1", diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py index 06cca171b865..093cf81a6e6e 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/csv_scenarios.py @@ -255,7 +255,7 @@ "data": { "col1": "val11", "col2": "val12", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -264,7 +264,7 @@ "data": { "col1": "val21", "col2": "val22", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -337,7 +337,7 @@ "data": { "col1": "val11a", "col2": "val12a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -346,7 +346,7 @@ "data": { "col1": "val21a", "col2": "val22a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -356,7 +356,7 @@ "col1": "val11b", "col2": "val12b", "col3": "val13b", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -366,7 +366,7 @@ "col1": "val21b", "col2": "val22b", "col3": "val23b", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -438,7 +438,7 @@ "data": { "col1": "val11a", "col2": "val12a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -447,7 +447,7 @@ "data": { "col1": "val21a", "col2": "val22a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -457,7 +457,7 @@ "col1": "val11b", "col2": "val12b", "col3": "val13b", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -467,7 +467,7 @@ "col1": "val21b", "col2": "val22b", "col3": "val23b", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -603,7 +603,7 @@ "data": { "col1": "val11a", "col2": "val12a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -612,7 +612,7 @@ "data": { "col1": "val21a", "col2": "val22a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -705,7 +705,7 @@ "data": { "col1": "val11a", "col2": "val12a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -714,25 +714,25 @@ "data": { "col1": "val21a", "col2": "val22a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", }, { - "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1", }, { - "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1", }, { - "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, { - "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, ] @@ -814,7 +814,7 @@ "col1": "val11", "col2": "val12", "col3": "val |13|", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -824,7 +824,7 @@ "col1": "val21", "col2": "val22", "col3": "val23", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -834,7 +834,7 @@ "col1": "val,31", "col2": "val |,32|", "col3": "val, !! 33", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -922,7 +922,7 @@ "col1": "val11", "col2": "val12", "col3": "val |13|", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -932,7 +932,7 @@ "col1": "val21", "col2": "val22", "col3": "val23", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -942,7 +942,7 @@ "col1": "val,31", "col2": "val |,32|", "col3": "val, !! 33", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1063,7 +1063,7 @@ "data": { "col1": "val11a", "col2": "val ! 12a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1072,25 +1072,25 @@ "data": { "col1": "val ! 21a", "col2": "val22a", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", }, { - "data": {"col3": "val @@@@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val @@@@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1", }, { - "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1", }, { - "data": {"col3": "val @@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val @@ 13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, { - "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, ] @@ -1160,7 +1160,7 @@ "data": { "col1": "val11", "col2": "val12", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1169,7 +1169,7 @@ "data": { "col1": "val21", "col2": "val22", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1241,7 +1241,7 @@ { "data": { "data": {"col1": "val11a", "col2": "val12a"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1249,7 +1249,7 @@ { "data": { "data": {"col1": "val21a", "col2": "val22a"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1257,7 +1257,7 @@ { "data": { "data": {"col1": "val11b", "col2": "val12b", "col3": "val13b"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -1265,7 +1265,7 @@ { "data": { "data": {"col1": "val21b", "col2": "val22b", "col3": "val23b"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv", }, "stream": "stream1", @@ -1357,7 +1357,7 @@ { "data": { "data": {"col1": "val11a", "col2": "val12a"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", @@ -1365,17 +1365,17 @@ { "data": { "data": {"col1": "val21a", "col2": "val22a"}, - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv", }, "stream": "stream1", }, { - "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, { - "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, + "data": {"col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2", }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py index c25a5fe48823..78ba23c3760e 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/incremental_scenarios.py @@ -48,14 +48,15 @@ )) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { "some_old_file.csv": "2023-06-01T03:54:07.000000Z", "a.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", } } ] @@ -139,6 +140,7 @@ "history": { "a.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", } } ] @@ -217,13 +219,14 @@ )) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { "a.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", } } ] @@ -316,13 +319,14 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { "a.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_a.csv", } } ] @@ -401,11 +405,11 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -413,6 +417,7 @@ "a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", } } ] @@ -479,14 +484,15 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { "recent_file.csv": "2023-07-15T23:59:59.000000Z", "a.csv": "2023-06-05T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-07-15T23:59:59.000000Z_recent_file.csv", } } ] @@ -576,18 +582,19 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-04T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-04T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { "a.csv": "2023-06-04T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-04T03:54:07.000000Z_a.csv", } }, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -595,6 +602,7 @@ "a.csv": "2023-06-04T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", } } ] @@ -681,11 +689,11 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -693,11 +701,12 @@ "a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", } }, - {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, { "stream1": { @@ -706,6 +715,7 @@ "b.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", } }, ] @@ -794,9 +804,9 @@ [ # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -804,11 +814,12 @@ "a.csv": "2023-06-05T03:54:07.000000Z", "b.csv": "2023-06-05T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_b.csv", } }, - {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, { "stream1": { @@ -817,6 +828,7 @@ "b.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", } }, ] @@ -914,9 +926,9 @@ [ # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # this file is skipped # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, # this file is skipped - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, # {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c"}, "stream": "stream1"}, # this file is skipped # {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c"}, "stream": "stream1"}, # this file is skipped @@ -927,6 +939,7 @@ "b.csv": "2023-06-05T03:54:07.000000Z", "c.csv": "2023-06-06T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_c.csv", } }, ] @@ -1026,8 +1039,8 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { "history": { @@ -1035,11 +1048,12 @@ "old_file_same_timestamp_as_a.csv": "2023-06-06T03:54:07.000000Z", "a.csv": "2023-06-06T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z_old_file_same_timestamp_as_a.csv", } }, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-07T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-07T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -1048,11 +1062,12 @@ "a.csv": "2023-06-06T03:54:07.000000Z", "b.csv": "2023-06-07T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-07T03:54:07.000000Z_b.csv", } }, - {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-10T03:54:07Z", + {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-10T03:54:07Z", + {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, { "stream1": { @@ -1061,6 +1076,7 @@ "b.csv": "2023-06-07T03:54:07.000000Z", "c.csv": "2023-06-10T03:54:07.000000Z" }, + "_ab_source_file_last_modified": "2023-06-10T03:54:07.000000Z_c.csv", } }, ] @@ -1170,19 +1186,19 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11c", "col2": "val12c", "col3": "val13c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21c", "col2": "val22c", "col3": "val23c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11d", "col2": "val12d", "col3": "val13d", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11d", "col2": "val12d", "col3": "val13d", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21d", "col2": "val22d", "col3": "val23d", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21d", "col2": "val22d", "col3": "val23d", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, { "stream1": { @@ -1191,6 +1207,7 @@ "c.csv": "2023-06-05T03:54:07.000000Z", "d.csv": "2023-06-05T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", } } ] @@ -1294,6 +1311,7 @@ "c.csv": "2023-06-05T03:54:07.000000Z", "d.csv": "2023-06-05T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z_d.csv", } } ] @@ -1405,9 +1423,9 @@ [ # {"data": {"col1": "val11a", "col2": "val12a"}, "stream": "stream1"}, # This file is skipped because it is older than the time_window # {"data": {"col1": "val21a", "col2": "val22a"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -1416,6 +1434,7 @@ "d.csv": "2023-06-08T03:54:07.000000Z", "e.csv": "2023-06-08T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_e.csv", } }, ] @@ -1525,9 +1544,9 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, { "stream1": { @@ -1536,11 +1555,12 @@ "c.csv": "2023-06-07T03:54:07.000000Z", "d.csv": "2023-06-08T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", } }, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-06T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-06T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, { "stream1": { @@ -1549,6 +1569,7 @@ "c.csv": "2023-06-07T03:54:07.000000Z", "d.csv": "2023-06-08T03:54:07.000000Z", }, + "_ab_source_file_last_modified": "2023-06-08T03:54:07.000000Z_d.csv", } }, ] diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py index 81b427ed1a7e..cffb7585bfd1 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/jsonl_scenarios.py @@ -64,9 +64,9 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, ] ) @@ -141,13 +141,13 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, ] ) @@ -218,13 +218,13 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, ] ) @@ -296,13 +296,13 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, ] ) @@ -364,7 +364,7 @@ } ) .set_expected_records([ - {"data": {"col1": "val1", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, ]) .set_expected_discover_error(SchemaInferenceError, FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value) @@ -475,17 +475,17 @@ ) .set_expected_records( [ - {"data": {"col1": 1, "col2": "record1", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 1, "col2": "record1", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": 2, "col2": "record2", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 2, "col2": "record2", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream2"}, - {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream2"}, ] ) @@ -555,13 +555,13 @@ ) .set_expected_records( [ - {"data": {"data": {"col1": 1, "col2": "record1"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 1, "col2": "record1"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"data": {"col1": 2, "col2": "record2"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 2, "col2": "record2"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"data": {"col1": 3, "col2": "record3", "col3": 1.1}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 3, "col2": "record3", "col3": 1.1}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, - {"data": {"data": {"col1": 4, "col2": "record4", "col3": 1.1}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 4, "col2": "record4", "col3": 1.1}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream1"}, ] ) @@ -657,13 +657,13 @@ ) .set_expected_records( [ - {"data": {"data": {"col1": 1, "col2": "record1"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 1, "col2": "record1"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"data": {"col1": 2, "col2": "record2"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"data": {"col1": 2, "col2": "record2"}, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 1.1, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream2"}, - {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.jsonl"}, + {"data": {"col3": 2.2, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.jsonl"}, "stream": "stream2"}, ] ) @@ -728,9 +728,9 @@ ) .set_expected_records( [ - {"data": {"col1": 1, "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 1, "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, - {"data": {"col1": 2, "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 2, "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.jsonl"}, "stream": "stream1"}, ] ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py index ce0c2c23fbcd..75a693593430 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/parquet_scenarios.py @@ -170,9 +170,9 @@ .set_file_type("parquet") .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, ] ) @@ -258,13 +258,13 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": "val12a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, - {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21a", "col2": "val22a", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.parquet"}, "stream": "stream1"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.parquet"}, "stream": "stream1"}, ] ) @@ -404,7 +404,7 @@ "col_list": [1, 2, 3, 4], "col_duration": 12345, "col_binary": "binary string. Hello world!", - "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1" }, ] @@ -430,7 +430,7 @@ .set_file_type("parquet") .set_expected_records( [ - {"data": {"col1": "13.00", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "13.00", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, ] ) @@ -487,7 +487,7 @@ .set_file_type("parquet") .set_expected_records( [ - {"data": {"col1": "13.00", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "13.00", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, ] ) @@ -544,7 +544,7 @@ .set_file_type("parquet") .set_expected_records( [ - {"data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, ] ) @@ -598,7 +598,7 @@ .set_file_type("parquet") .set_expected_records( [ - {"data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": 13.00, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.parquet"}, "stream": "stream1"}, ] ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py index 9a5f84921551..e37e35fe53c3 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py @@ -61,9 +61,9 @@ ) .set_expected_records( [ - {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11", "col2": "val12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21", "col2": "val22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, ] ) @@ -338,18 +338,18 @@ ) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # The files in b.csv are emitted despite having an invalid schema - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, - {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, ] ) @@ -536,17 +536,17 @@ .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, - {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, ] ) @@ -675,17 +675,17 @@ .set_expected_check_error(None, FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA.value) .set_expected_records( [ - {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11a", "col2": 21, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val12a", "col2": 22, "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + # {"data": {"col1": "val11b", "col2": "val12b", "col3": "val13b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", # "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + # {"data": {"col1": "val21b", "col2": "val22b", "col3": "val23b", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", # "_ab_source_file_url": "b.csv"}, "stream": "stream2"}, - {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val11c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, - {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", + {"data": {"col1": "val21c", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream3"}, ] ) diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py index aea0bd83949b..55d0d40911f9 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py @@ -213,14 +213,14 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + # {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) .set_expected_logs({ @@ -262,20 +262,20 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform - {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # This record is skipped because it does not conform + {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform + {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, # This record is skipped because it does not conform + # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] ) .set_expected_logs({ @@ -314,14 +314,14 @@ ) .set_expected_records( [ - {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_11", "col2": "val_a_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_a_12", "col2": "val_a_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_b_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_b_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_c_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_c_12", None: "val_c_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_c_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "c.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error + # {"data": {"col1": "val_d_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "d.csv"}, "stream": "stream1"}, ] ) .set_expected_logs({ @@ -359,20 +359,20 @@ ) .set_expected_records( [ - {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted - # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error - # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb2_12", "col2": "val_bb2_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # This record is malformed so should not be emitted + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, # No more records from this stream are emitted after we hit a parse error + # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb2_12", "col2": "val_bb2_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] ) .set_expected_logs({ @@ -439,20 +439,20 @@ ) .set_expected_records( [ - # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # The first record does not conform so we don't sync anything from this stream - # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, - # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, - {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # No more records from this stream are emitted after a nonconforming record is encountered - # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, - # {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_aa1_11", "col2": "val_aa1_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, # The first record does not conform so we don't sync anything from this stream + # {"data": {"col1": "val_aa1_12", "col2": "val_aa1_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a1.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa2_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a2.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_12", None: "val_aa3_22", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa3_13", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a3.csv"}, "stream": "stream1"}, + # {"data": {"col1": "val_aa4_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "a/a4.csv"}, "stream": "stream1"}, + {"data": {"col1": "val_bb1_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + {"data": {"col1": "val_bb1_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b1.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb2_11", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, # No more records from this stream are emitted after a nonconforming record is encountered + # {"data": {"col1": "val_bb2_12", "col2": "val_bb2_21", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b2.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb3_11", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, + # {"data": {"col1": "val_bb3_12", "_ab_source_file_last_modified": "2023-06-05T03:54:07.000000Z", "_ab_source_file_url": "b/b3.csv"}, "stream": "stream2"}, ] ) .set_expected_logs({ diff --git a/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py index fae69e409ba0..cc3aeab21cff 100644 --- a/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +++ b/airbyte-cdk/python/unit_tests/sources/file_based/stream/test_default_file_based_cursor.py @@ -30,11 +30,14 @@ [datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2020, 12, 31)], - {"history": { - "a.csv": "2021-01-01T00:00:00.000000Z", - "b.csv": "2021-01-02T00:00:00.000000Z", - "c.csv": "2020-12-31T00:00:00.000000Z", - }, }, + { + "history": { + "a.csv": "2021-01-01T00:00:00.000000Z", + "b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2020-12-31T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-02T00:00:00.000000Z_b.csv", + }, id="test_file_start_time_is_earliest_time_in_history"), pytest.param([ RemoteFile(uri="a.csv", @@ -55,11 +58,14 @@ datetime(2021, 1, 1), datetime(2021, 1, 1), datetime(2021, 1, 2)], - {"history": { - "b.csv": "2021-01-02T00:00:00.000000Z", - "c.csv": "2021-01-03T00:00:00.000000Z", - "d.csv": "2021-01-04T00:00:00.000000Z", - }, }, + { + "history": { + "b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2021-01-03T00:00:00.000000Z", + "d.csv": "2021-01-04T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv", + }, id="test_earliest_file_is_removed_from_history_if_history_is_full"), pytest.param([ RemoteFile(uri="a.csv", @@ -85,11 +91,14 @@ datetime(2021, 1, 2), datetime(2021, 1, 2), ], - {"history": { - "file_with_same_timestamp_as_b.csv": "2021-01-02T00:00:00.000000Z", - "c.csv": "2021-01-03T00:00:00.000000Z", - "d.csv": "2021-01-04T00:00:00.000000Z", - }, }, + { + "history": { + "file_with_same_timestamp_as_b.csv": "2021-01-02T00:00:00.000000Z", + "c.csv": "2021-01-03T00:00:00.000000Z", + "d.csv": "2021-01-04T00:00:00.000000Z", + }, + "_ab_source_file_last_modified": "2021-01-04T00:00:00.000000Z_d.csv", + }, id="test_files_are_sorted_by_timestamp_and_by_name"), ], )