File-based CDK + Source S3 (v4): Pass configured file encoding to stream reader #29110
@@ -16,6 +16,7 @@
 class JsonlParser(FileTypeParser):

     MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
+    ENCODING = "utf8"

     async def infer_schema(
         self,

Review comment on ENCODING = "utf8": encoding isn't configurable in the legacy S3 source. We can move this to a config if needed.
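If it does move to a config, a rough sketch of what that could look like follows. Note that JsonlFormat and its encoding field are hypothetical here, mirroring how CsvFormat exposes encoding; they are not part of this PR.

# Hypothetical: resolve the encoding from the stream's format config,
# falling back to the current class default.
class JsonlParser(FileTypeParser):
    DEFAULT_ENCODING = "utf8"

    def _get_encoding(self, config: FileBasedStreamConfig) -> Optional[str]:
        jsonl_format = config.format.get("jsonl") if config.format else None
        # JsonlFormat.encoding is an assumed field, not part of this PR.
        return getattr(jsonl_format, "encoding", None) or self.DEFAULT_ENCODING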
@@ -31,7 +32,7 @@ async def infer_schema(
         inferred_schema: Dict[str, Any] = {}
         read_bytes = 0

-        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
             for line in fp:
                 if read_bytes < self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
                     line_schema = self.infer_schema_for_record(json.loads(line))
@@ -53,7 +54,7 @@ def parse_records(
         stream_reader: AbstractFileBasedStreamReader,
         logger: logging.Logger,
     ) -> Iterable[Dict[str, Any]]:
-        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
             for line in fp:
                 yield json.loads(line)
@@ -18,6 +18,9 @@

 class ParquetParser(FileTypeParser):

+    ENCODING = None
+
     async def infer_schema(
         self,
         config: FileBasedStreamConfig,
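A note on ENCODING = None: parquet is a binary format, and a text encoding only applies to text-mode opens. Python's built-in open (whose signature smart_open follows) rejects the combination, which is why the parquet parser forwards None:

# Passing an encoding together with a binary mode raises:
open("data.parquet", "rb", encoding="utf-8")
# ValueError: binary mode doesn't take an encoding argument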
@@ -29,7 +32,7 @@ async def infer_schema(
         if not isinstance(parquet_format, ParquetFormat):
             raise ValueError(f"Expected ParquetFormat, got {parquet_format}")

-        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
             parquet_file = pq.ParquetFile(fp)
             parquet_schema = parquet_file.schema_arrow
@@ -50,8 +53,8 @@ def parse_records(
     ) -> Iterable[Dict[str, Any]]:
         parquet_format = config.format[config.file_type] if config.format else ParquetFormat()
         if not isinstance(parquet_format, ParquetFormat):
-            raise ValueError(f"Expected ParquetFormat, got {parquet_format}")  # FIXME test this branch!
-        with stream_reader.open_file(file, self.file_read_mode, logger) as fp:
+            raise ValueError(f"Expected ParquetFormat, got {parquet_format}")
+        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
             reader = pq.ParquetFile(fp)
             partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
             for row_group in range(reader.num_row_groups):
@@ -1,14 +1,21 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import asyncio
import io
import logging
from datetime import datetime
from typing import Iterable, List, Optional
from unittest.mock import MagicMock, Mock

import pytest
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
from airbyte_cdk.sources.file_based.config.csv_format import DEFAULT_FALSE_VALUES, DEFAULT_TRUE_VALUES, CsvFormat
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
from airbyte_cdk.sources.file_based.exceptions import RecordParseError
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
from airbyte_cdk.sources.file_based.file_types.csv_parser import CsvParser, _cast_types
from airbyte_cdk.sources.file_based.remote_file import RemoteFile

PROPERTY_TYPES = {
    "col1": "null",
@@ -96,3 +103,40 @@ def test_read_and_cast_types(reader_values, expected_rows):
         list(parser._read_and_cast_types(reader, schema, config_format, logger))
     else:
         assert expected_rows == list(parser._read_and_cast_types(reader, schema, config_format, logger))
+
+
+class MockFileBasedStreamReader(AbstractFileBasedStreamReader):
+    def __init__(self, expected_encoding: Optional[str]):
+        self._expected_encoding = expected_encoding
+
+    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> io.IOBase:
+        assert encoding == self._expected_encoding
+        return io.StringIO("c1,c2\nv1,v2")
+
+    def get_matching_files(self, globs: List[str], logger: logging.Logger) -> Iterable[RemoteFile]:
+        pass
+
+    @property
+    def config(self) -> Optional[AbstractFileBasedSpec]:
+        return None
+
+    @config.setter
+    def config(self, value: AbstractFileBasedSpec) -> None:
+        pass
+
+
+def test_encoding_is_passed_to_stream_reader():
+    parser = CsvParser()
+    encoding = "ascii"
+    stream_reader = MockFileBasedStreamReader(encoding)
+    file = RemoteFile(uri="s3://bucket/key.csv", last_modified=datetime.now())
+    config = FileBasedStreamConfig(
+        name="test",
+        validation_policy="emit_record",
+        file_type="csv",
+        format={"csv": CsvFormat(encoding=encoding)}
+    )
+    list(parser.parse_records(config, file, stream_reader, logger))
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(parser.infer_schema(config, file, stream_reader, logger))

Review comment on the encoding assertion in open_file: not a great test, but the actual decoding is done outside of the CDK.
Review comment: Why are we defining a class?
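One answer to the "why a class" question would be a specced MagicMock instead of a hand-written subclass. A sketch, under the assumption that asserting on the recorded call arguments after the fact is acceptable for this test:

# Hypothetical alternative to MockFileBasedStreamReader, not part of this PR.
# spec keeps the mock's surface consistent with the real ABC.
def test_encoding_is_passed_to_stream_reader_with_mock():
    parser = CsvParser()
    stream_reader = MagicMock(spec=AbstractFileBasedStreamReader)
    stream_reader.open_file.return_value.__enter__.return_value = io.StringIO("c1,c2\nv1,v2")
    file = RemoteFile(uri="s3://bucket/key.csv", last_modified=datetime.now())
    config = FileBasedStreamConfig(
        name="test",
        validation_policy="emit_record",
        file_type="csv",
        format={"csv": CsvFormat(encoding="ascii")},
    )
    list(parser.parse_records(config, file, stream_reader, logging.getLogger("test")))
    # Positional args of the recorded call are (file, mode, encoding, logger).
    assert stream_reader.open_file.call_args[0][2] == "ascii"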
@@ -6,6 +6,7 @@
 import decimal

 import pyarrow as pa
+from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError
 from unit_tests.sources.file_based.in_memory_files_source import TemporaryParquetFilesStreamReader
 from unit_tests.sources.file_based.scenarios.scenario_builder import TestScenarioBuilder
@@ -629,3 +630,34 @@
     }
   )
 ).build()
+
+
+parquet_with_invalid_config_scenario = (
+    TestScenarioBuilder()
+    .set_name("parquet_with_invalid_config_scenario")
+    .set_config(
+        {
+            "streams": [
+                {
+                    "name": "stream1",
+                    "file_type": "parquet",
+                    "globs": ["*"],
+                    "validation_policy": "emit_record",
+                    "format": {
+                        "parquet": {
+                            "filetype": "csv",
+                        }
+                    }
+                }
+            ]
+        }
+    )
+    .set_stream_reader(TemporaryParquetFilesStreamReader(files=_single_parquet_file, file_type="parquet"))
+    .set_file_type("parquet")
+    .set_expected_read_error(ConfigValidationError, "Error creating stream config object. Contact Support if you need assistance.")
+    .set_expected_discover_error(ConfigValidationError, "Error creating stream config object. Contact Support if you need assistance.")
+    .set_expected_records(
+        [
+            # No records were read
+        ]
+    )
+).build()

Review comment on the expected error: I guess this is invalid because of the csv, right? Could we have a better error message? Also, could this be a unit test?
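On the "could this be a unit test?" point: since the failure happens while building the stream config, a narrower test could target config construction directly. A sketch, assuming the mismatched "filetype": "csv" inside a parquet format block is what pydantic rejects (an assumption worth confirming against FileBasedStreamConfig):

import pytest
from pydantic import ValidationError

from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig


def test_parquet_stream_rejects_csv_format():
    # Assumed behavior: "filetype": "csv" fails ParquetFormat validation,
    # surfacing as a pydantic ValidationError at construction time.
    with pytest.raises(ValidationError):
        FileBasedStreamConfig(
            name="stream1",
            file_type="parquet",
            globs=["*"],
            validation_policy="emit_record",
            format={"parquet": {"filetype": "csv"}},
        )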
@@ -89,15 +89,15 @@ def get_matching_files(self, globs: List[str], logger: logging.Logger) -> Iterable[RemoteFile]:
             ) from exc

     @contextmanager
-    def open_file(self, file: RemoteFile, mode: FileReadMode, logger: logging.Logger) -> IOBase:
+    def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
         try:
             params = {"client": self.s3_client}
         except Exception as exc:
             raise exc

         logger.debug(f"try to open {file.uri}")
         try:
-            result = smart_open.open(f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value)
+            result = smart_open.open(f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value, encoding=encoding)
         except OSError:
             logger.warning(
                 f"We don't have access to {file.uri}. The file appears to have become unreachable during sync."

Review comment on the smart_open call: it's a little tricky to test this until we have the adapter. I tested by merging the codebase with this branch to run the v4 source and reading …
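Until the adapter lands, one low-effort way to exercise this line is to patch smart_open.open and assert on the forwarded kwargs. A sketch only; the module path, class name, and zero-argument constructor below are all assumptions:

import logging
from unittest.mock import MagicMock, patch

from airbyte_cdk.sources.file_based.file_based_stream_reader import FileReadMode


def test_open_file_forwards_encoding():
    # Assumed import path and constructor for the v4 reader.
    from source_s3.v4.stream_reader import SourceS3StreamReader

    reader = SourceS3StreamReader()
    # Bypass real config/client wiring; only the smart_open call matters here.
    with patch.object(type(reader), "s3_client", new=MagicMock()), \
            patch.object(type(reader), "config", new=MagicMock(bucket="test-bucket")), \
            patch("source_s3.v4.stream_reader.smart_open.open") as mock_open:
        file = MagicMock(uri="key.csv")
        with reader.open_file(file, FileReadMode.READ, "latin-1", logging.getLogger("test")):
            pass
        assert mock_open.call_args.kwargs["encoding"] == "latin-1"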
Review comment: escape_char is an optional field.
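For reference, an optional pydantic field of the kind that comment describes looks roughly like this (a sketch; the real CsvFormat field name is confirmed by the comment, but its exact Field metadata may differ):

from typing import Optional

from pydantic import BaseModel, Field


class CsvFormatSketch(BaseModel):
    # Optional with a None default, so configs may omit it entirely.
    escape_char: Optional[str] = Field(default=None, description="The character used to escape special characters.")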