Skip to content

Commit

Permalink
FIX-#4057: Allow reading an empty parquet file. (#4075)
Browse files Browse the repository at this point in the history
  • Loading branch information
mvashishtha authored Feb 23, 2022
1 parent 47b8a1c commit dae9891
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 7 deletions.
3 changes: 2 additions & 1 deletion docs/release_notes/release_notes-0.14.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Key Features and Updates
* FIX-#4158: Do not print OmniSci logs to stdout by default (#4159)
* FIX-#4177: Support read_feather from pathlike objects (#4177)
* FIX-#4234: Upgrade pandas to 1.4.1 (#4235)
* FIX-#4057: Allow reading an empty parquet file (#4075)
* Performance enhancements
* FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140)
* Benchmarking enhancements
Expand Down Expand Up @@ -61,4 +62,4 @@ Contributors
@devin-petersohn
@dchigarev
@Garra1980
@mvashishtha
@mvashishtha
21 changes: 15 additions & 6 deletions modin/core/io/column_stores/column_store_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ def build_index(cls, partition_ids):
List with lengths of index chunks.
"""
num_partitions = NPartitions.get()
index_len = cls.materialize(partition_ids[-2][0])
index_len = (
0 if len(partition_ids) == 0 else cls.materialize(partition_ids[-2][0])
)
if isinstance(index_len, int):
index = pandas.RangeIndex(index_len)
else:
Expand Down Expand Up @@ -158,15 +160,18 @@ def build_columns(cls, columns):
List with lengths of `col_partitions` subarrays
(number of columns that should be read by workers).
"""
columns_length = len(columns)
if columns_length == 0:
return [], []
num_partitions = NPartitions.get()
column_splits = (
len(columns) // num_partitions
if len(columns) % num_partitions == 0
else len(columns) // num_partitions + 1
columns_length // num_partitions
if columns_length % num_partitions == 0
else columns_length // num_partitions + 1
)
col_partitions = [
columns[i : i + column_splits]
for i in range(0, len(columns), column_splits)
for i in range(0, columns_length, column_splits)
]
column_widths = [len(c) for c in col_partitions]
return col_partitions, column_widths
Expand Down Expand Up @@ -215,7 +220,11 @@ def build_query_compiler(cls, path, columns, **kwargs):
partition_ids = cls.call_deploy(path, col_partitions, **kwargs)
index, row_lens = cls.build_index(partition_ids)
remote_parts = cls.build_partition(partition_ids[:-2], row_lens, column_widths)
dtypes = cls.build_dtypes(partition_ids[-1], columns)
dtypes = (
cls.build_dtypes(partition_ids[-1], columns)
if len(partition_ids) > 0
else None
)
new_query_compiler = cls.query_compiler_cls(
cls.frame_cls(
remote_parts,
Expand Down
8 changes: 8 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1397,6 +1397,14 @@ def test_read_parquet_without_metadata(self):
finally:
teardown_test_files([parquet_fname, csv_fname])

def test_read_empty_parquet_file(self):
    """Regression test for #4057: read_parquet must handle an empty parquet file."""
    # An empty DataFrame round-tripped through parquet used to break Modin's
    # partition-building logic (no partitions to index into).
    empty_df = pandas.DataFrame()
    with tempfile.TemporaryDirectory() as tmp_dir:
        dataset_path = f"{tmp_dir}/data"
        os.makedirs(dataset_path)
        # Write a single empty part file into the dataset directory.
        empty_df.to_parquet(f"{dataset_path}/part-00000.parquet")
        # eval_io runs read_parquet with both pandas and Modin and compares results.
        eval_io(fn_name="read_parquet", path=dataset_path)

@pytest.mark.xfail(
condition="config.getoption('--simulate-cloud').lower() != 'off'",
reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264",
Expand Down

0 comments on commit dae9891

Please sign in to comment.