Skip to content

Commit

Permalink
FIX-#4057: Allow reading an empty parquet file. (#4075)
Browse files Browse the repository at this point in the history
  • Loading branch information
mvashishtha authored Feb 23, 2022
1 parent 47b8a1c commit dae9891
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 7 deletions.
3 changes: 2 additions & 1 deletion docs/release_notes/release_notes-0.14.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Key Features and Updates
* FIX-#4158: Do not print OmniSci logs to stdout by default (#4159)
* FIX-#4177: Support read_feather from pathlike objects (#4177)
* FIX-#4234: Upgrade pandas to 1.4.1 (#4235)
* FIX-#4057: Allow reading an empty parquet file (#4075)
* Performance enhancements
* FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140)
* Benchmarking enhancements
Expand Down Expand Up @@ -61,4 +62,4 @@ Contributors
@devin-petersohn
@dchigarev
@Garra1980
@mvashishtha
@mvashishtha
21 changes: 15 additions & 6 deletions modin/core/io/column_stores/column_store_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ def build_index(cls, partition_ids):
List with lengths of index chunks.
"""
num_partitions = NPartitions.get()
index_len = cls.materialize(partition_ids[-2][0])
index_len = (
0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0])
)
if isinstance(index_len, int):
index = pandas.RangeIndex(index_len)
else:
Expand Down Expand Up @@ -158,15 +160,18 @@ def build_columns(cls, columns):
List with lengths of `col_partitions` subarrays
(number of columns that should be read by workers).
"""
columns_length = len(columns)
if columns_length == 0:
return [], []
num_partitions = NPartitions.get()
column_splits = (
len(columns) // num_partitions
if len(columns) % num_partitions == 0
else len(columns) // num_partitions + 1
columns_length // num_partitions
if columns_length % num_partitions == 0
else columns_length // num_partitions + 1
)
col_partitions = [
columns[i : i + column_splits]
for i in range(0, len(columns), column_splits)
for i in range(0, columns_length, column_splits)
]
column_widths = [len(c) for c in col_partitions]
return col_partitions, column_widths
Expand Down Expand Up @@ -215,7 +220,11 @@ def build_query_compiler(cls, path, columns, **kwargs):
partition_ids = cls.call_deploy(path, col_partitions, **kwargs)
index, row_lens = cls.build_index(partition_ids)
remote_parts = cls.build_partition(partition_ids[:-2], row_lens, column_widths)
dtypes = cls.build_dtypes(partition_ids[-1], columns)
dtypes = (
cls.build_dtypes(partition_ids[-1], columns)
if len(partition_ids) > 0
else None
)
new_query_compiler = cls.query_compiler_cls(
cls.frame_cls(
remote_parts,
Expand Down
8 changes: 8 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,6 +1397,14 @@ def test_read_parquet_without_metadata(self):
finally:
teardown_test_files([parquet_fname, csv_fname])

def test_read_empty_parquet_file(self):
    """Regression test for #4057: read_parquet must handle an empty parquet file."""
    # An empty DataFrame round-tripped through parquet used to break Modin's
    # partition-building logic (no partitions to index into).
    empty_df = pandas.DataFrame()
    with tempfile.TemporaryDirectory() as tmp_dir:
        dataset_path = f"{tmp_dir}/data"
        os.makedirs(dataset_path)
        # Write a single empty part file into the dataset directory.
        empty_df.to_parquet(f"{dataset_path}/part-00000.parquet")
        # eval_io runs read_parquet with both pandas and Modin and compares results.
        eval_io(fn_name="read_parquet", path=dataset_path)

@pytest.mark.xfail(
condition="config.getoption('--simulate-cloud').lower() != 'off'",
reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264",
Expand Down

0 comments on commit dae9891

Please sign in to comment.