-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Source file: do not read whole file on check and discover #24278
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
|
||
import json | ||
import sys | ||
import tempfile | ||
import traceback | ||
import urllib | ||
|
@@ -288,11 +289,12 @@ def load_yaml(self, fp): | |
if self._reader_format == "yaml": | ||
return pd.DataFrame(safe_load(fp)) | ||
|
||
def load_dataframes(self, fp, skip_data=False) -> Iterable: | ||
def load_dataframes(self, fp, skip_data=False, read_sample_chunk: bool = False) -> Iterable: | ||
"""load and return the appropriate pandas dataframe. | ||
:param fp: file-like object to read from | ||
:param skip_data: limit reading data | ||
:param read_sample_chunk: indicates whether a single chunk should only be read to generate schema | ||
:return: a list of dataframe loaded from files described in the configuration | ||
""" | ||
readers = { | ||
|
@@ -321,11 +323,16 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable: | |
reader_options = {**self._reader_options} | ||
try: | ||
if self._reader_format == "csv": | ||
bytes_read = 0 | ||
reader_options["chunksize"] = self.CSV_CHUNK_SIZE | ||
if skip_data: | ||
reader_options["nrows"] = 0 | ||
reader_options["index_col"] = 0 | ||
yield from reader(fp, **reader_options) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Read only |
||
for record in reader(fp, **reader_options): | ||
bytes_read += sys.getsizeof(record) | ||
yield record | ||
if read_sample_chunk and bytes_read >= self.CSV_CHUNK_SIZE: | ||
return | ||
elif self._reader_options == "excel_binary": | ||
reader_options["engine"] = "pyxlsb" | ||
yield from reader(fp, **reader_options) | ||
|
@@ -393,13 +400,17 @@ def _cache_stream(self, fp): | |
fp.close() | ||
return fp_tmp | ||
|
||
def _stream_properties(self, fp): | ||
def _stream_properties(self, fp, empty_schema: bool = False, read_sample_chunk: bool = False): | ||
""" | ||
empty_schema param is used to check connectivity, i.e. we only read a header and do not produce stream properties | ||
read_sample_chunk is used to determine if just one chunk should be read to generate schema | ||
""" | ||
if self._reader_format == "yaml": | ||
df_list = [self.load_yaml(fp)] | ||
else: | ||
if self.binary_source: | ||
fp = self._cache_stream(fp) | ||
df_list = self.load_dataframes(fp, skip_data=False) | ||
df_list = self.load_dataframes(fp, skip_data=empty_schema, read_sample_chunk=read_sample_chunk) | ||
fields = {} | ||
for df in df_list: | ||
for col in df.columns: | ||
|
@@ -408,8 +419,7 @@ def _stream_properties(self, fp): | |
fields[col] = self.dtype_to_json_type(prev_frame_column_type, df[col].dtype) | ||
return {field: {"type": [fields[field], "null"]} for field in fields} | ||
|
||
@property | ||
def streams(self) -> Iterable: | ||
def streams(self, empty_schema: bool = False) -> Iterable: | ||
"""Discovers available streams""" | ||
# TODO handle discovery of directories of multiple files instead | ||
with self.reader.open() as fp: | ||
|
@@ -419,6 +429,6 @@ def streams(self) -> Iterable: | |
json_schema = { | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"type": "object", | ||
"properties": self._stream_properties(fp), | ||
"properties": self._stream_properties(fp, empty_schema=empty_schema, read_sample_chunk=True), | ||
} | ||
yield AirbyteStream(name=self.stream_name, json_schema=json_schema, supported_sync_modes=[SyncMode.full_refresh]) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,7 +79,8 @@ def _get_client(self, config: Mapping): | |
|
||
return client | ||
|
||
def _validate_and_transform(self, config: Mapping[str, Any]): | ||
@staticmethod | ||
def _validate_and_transform(config: Mapping[str, Any]): | ||
if "reader_options" in config: | ||
try: | ||
config["reader_options"] = json.loads(config["reader_options"]) | ||
|
@@ -108,9 +109,8 @@ def check(self, logger, config: Mapping) -> AirbyteConnectionStatus: | |
client = self._get_client(config) | ||
source_url = client.reader.full_url | ||
try: | ||
with client.reader.open(): | ||
list(client.streams) | ||
return AirbyteConnectionStatus(status=Status.SUCCEEDED) | ||
list(client.streams(empty_schema=True)) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Read only the file header when running `check` |
||
return AirbyteConnectionStatus(status=Status.SUCCEEDED) | ||
except (TypeError, ValueError, ConfigurationError) as err: | ||
reason = f"Failed to load {source_url}\n Please check File Format and Reader Options are set correctly. \n{repr(err)}" | ||
logger.error(reason) | ||
|
@@ -127,13 +127,13 @@ def discover(self, logger: AirbyteLogger, config: Mapping) -> AirbyteCatalog: | |
""" | ||
config = self._validate_and_transform(config) | ||
client = self._get_client(config) | ||
name = client.stream_name | ||
name, full_url = client.stream_name, client.reader.full_url | ||
|
||
logger.info(f"Discovering schema of {name} at {client.reader.full_url}...") | ||
logger.info(f"Discovering schema of {name} at {full_url}...") | ||
try: | ||
streams = list(client.streams) | ||
streams = list(client.streams()) | ||
except Exception as err: | ||
reason = f"Failed to discover schemas of {name} at {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}" | ||
reason = f"Failed to discover schemas of {name} at {full_url}: {repr(err)}\n{traceback.format_exc()}" | ||
logger.error(reason) | ||
raise err | ||
return AirbyteCatalog(streams=streams) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dropped this test because it no longer represents the expected behavior.