REFACTOR-#5718: add columns parameter for get_dtypes function (#5717)

anmyachev · web-flow · commit bb0950d06ce4 · 2023-03-01T14:39:47.000+01:00
Signed-off-by: Anatoly Myachev &lt;anatoly.myachev@intel.com&gt;
diff --git a/modin/core/io/text/csv_glob_dispatcher.py b/modin/core/io/text/csv_glob_dispatcher.py
@@ -231,18 +231,14 @@ def _read(cls, filepath_or_buffer, **kwargs):
             new_index = index_objs[0].append(index_objs[1:])
             new_index.name = pd_df_metadata.index.name
 
+        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
+
         # Compute dtypes by getting collecting and combining all of the partitions. The
         # reported dtypes from differing rows can be different based on the inference in
         # the limited data seen by each worker. We use pandas to compute the exact dtype
         # over the whole column for each column. The index is set below.
-        dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
+        dtypes = cls.get_dtypes(dtypes_ids, column_names)
 
-        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
-        # Set the index for the dtypes to the column names
-        if isinstance(dtypes, pandas.Series):
-            dtypes.index = column_names
-        else:
-            dtypes = pandas.Series(dtypes, index=column_names)
         new_frame = cls.frame_cls(
             partition_ids,
             new_index,
diff --git a/modin/core/io/text/excel_dispatcher.py b/modin/core/io/text/excel_dispatcher.py
@@ -221,18 +221,14 @@ def _read(cls, io, **kwargs):
             row_lengths = [len(o) for o in index_objs]
             new_index = index_objs[0].append(index_objs[1:])
 
+        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
+
         # Compute dtypes by getting collecting and combining all of the partitions. The
         # reported dtypes from differing rows can be different based on the inference in
         # the limited data seen by each worker. We use pandas to compute the exact dtype
         # over the whole column for each column. The index is set below.
-        dtypes = cls.get_dtypes(dtypes_ids)
+        dtypes = cls.get_dtypes(dtypes_ids, column_names)
 
-        data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
-        # Set the index for the dtypes to the column names
-        if isinstance(dtypes, pandas.Series):
-            dtypes.index = column_names
-        else:
-            dtypes = pandas.Series(dtypes, index=column_names)
         new_frame = cls.frame_cls(
             data_ids,
             new_index,
diff --git a/modin/core/io/text/json_dispatcher.py b/modin/core/io/text/json_dispatcher.py
@@ -84,13 +84,13 @@ def _read(cls, path_or_buf, **kwargs):
         row_lengths = cls.materialize(index_ids)
         new_index = pandas.RangeIndex(sum(row_lengths))
 
-        dtypes = cls.get_dtypes(dtypes_ids)
         partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
 
-        if isinstance(dtypes, pandas.Series):
-            dtypes.index = columns
-        else:
-            dtypes = pandas.Series(dtypes, index=columns)
+        # Compute dtypes by getting collecting and combining all of the partitions. The
+        # reported dtypes from differing rows can be different based on the inference in
+        # the limited data seen by each worker. We use pandas to compute the exact dtype
+        # over the whole column for each column. The index is set below.
+        dtypes = cls.get_dtypes(dtypes_ids, columns)
 
         new_frame = cls.frame_cls(
             np.array(partition_ids),
diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
@@ -909,19 +909,14 @@ def _get_new_qc(
             New query compiler, created from `new_frame`.
         """
         new_index, row_lengths = cls._define_index(index_ids, index_name)
+        # Compose modin partitions from `partition_ids`
+        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
+
         # Compute dtypes by collecting and combining all of the partition dtypes. The
         # reported dtypes from differing rows can be different based on the inference in
         # the limited data seen by each worker. We use pandas to compute the exact dtype
         # over the whole column for each column. The index is set below.
-        dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
-        # Compose modin partitions from `partition_ids`
-        partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
-
-        # Set the index for the dtypes to the column names
-        if isinstance(dtypes, pandas.Series):
-            dtypes.index = column_names
-        else:
-            dtypes = pandas.Series(dtypes, index=column_names)
+        dtypes = cls.get_dtypes(dtypes_ids, column_names)
 
         new_frame = cls.frame_cls(
             partition_ids,
diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py
@@ -211,22 +211,27 @@ def generic_parse(fname, **kwargs):
         ]
 
     @classmethod
-    def get_dtypes(cls, dtypes_ids):
+    def get_dtypes(cls, dtypes_ids, columns):
         """
         Get common for all partitions dtype for each of the columns.
 
         Parameters
         ----------
         dtypes_ids : list
             Array with references to the partitions dtypes objects.
+        columns : array-like or Index (1d)
+            The names of the columns in this variable will be used
+            for dtypes creation.
 
         Returns
         -------
-        frame_dtypes : pandas.Series or dtype
+        frame_dtypes : pandas.Series, dtype or None
             Resulting dtype or pandas.Series where column names are used as
             index and types of columns are used as values for full resulting
             frame.
         """
+        if len(dtypes_ids) == 0:
+            return None
         # each element in `partitions_dtypes` is a Series, where column names are
         # used as index and types of columns for different partitions are used as values
         partitions_dtypes = cls.materialize(dtypes_ids)
@@ -251,6 +256,12 @@ def get_dtypes(cls, dtypes_ids):
                 axis=1,
             ).squeeze(axis=0)
 
+        # Set the index for the dtypes to the column names
+        if isinstance(frame_dtypes, pandas.Series):
+            frame_dtypes.index = columns
+        else:
+            frame_dtypes = pandas.Series(frame_dtypes, index=columns)
+
         return frame_dtypes
 
     @classmethod