Skip to content

Commit bb0950d

Browse files
authored
REFACTOR-#5718: add columns parameter for get_dtypes function (#5717)
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
1 parent 4db3e70 commit bb0950d

File tree

5 files changed

+28
-30
lines changed

5 files changed

+28
-30
lines changed

modin/core/io/text/csv_glob_dispatcher.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -231,18 +231,14 @@ def _read(cls, filepath_or_buffer, **kwargs):
231231
new_index = index_objs[0].append(index_objs[1:])
232232
new_index.name = pd_df_metadata.index.name
233233

234+
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
235+
234236
# Compute dtypes by collecting and combining all of the partition dtypes. The
235237
# reported dtypes from differing rows can be different based on the inference in
236238
# the limited data seen by each worker. We use pandas to compute the exact dtype
237239
# over the whole column for each column. The index is set inside `get_dtypes`.
238-
dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
240+
dtypes = cls.get_dtypes(dtypes_ids, column_names)
239241

240-
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
241-
# Set the index for the dtypes to the column names
242-
if isinstance(dtypes, pandas.Series):
243-
dtypes.index = column_names
244-
else:
245-
dtypes = pandas.Series(dtypes, index=column_names)
246242
new_frame = cls.frame_cls(
247243
partition_ids,
248244
new_index,

modin/core/io/text/excel_dispatcher.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -221,18 +221,14 @@ def _read(cls, io, **kwargs):
221221
row_lengths = [len(o) for o in index_objs]
222222
new_index = index_objs[0].append(index_objs[1:])
223223

224+
data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
225+
224226
# Compute dtypes by collecting and combining all of the partition dtypes. The
225227
# reported dtypes from differing rows can be different based on the inference in
226228
# the limited data seen by each worker. We use pandas to compute the exact dtype
227229
# over the whole column for each column. The index is set inside `get_dtypes`.
228-
dtypes = cls.get_dtypes(dtypes_ids)
230+
dtypes = cls.get_dtypes(dtypes_ids, column_names)
229231

230-
data_ids = cls.build_partition(data_ids, row_lengths, column_widths)
231-
# Set the index for the dtypes to the column names
232-
if isinstance(dtypes, pandas.Series):
233-
dtypes.index = column_names
234-
else:
235-
dtypes = pandas.Series(dtypes, index=column_names)
236232
new_frame = cls.frame_cls(
237233
data_ids,
238234
new_index,

modin/core/io/text/json_dispatcher.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,13 +84,13 @@ def _read(cls, path_or_buf, **kwargs):
8484
row_lengths = cls.materialize(index_ids)
8585
new_index = pandas.RangeIndex(sum(row_lengths))
8686

87-
dtypes = cls.get_dtypes(dtypes_ids)
8887
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
8988

90-
if isinstance(dtypes, pandas.Series):
91-
dtypes.index = columns
92-
else:
93-
dtypes = pandas.Series(dtypes, index=columns)
89+
# Compute dtypes by collecting and combining all of the partition dtypes. The
90+
# reported dtypes from differing rows can be different based on the inference in
91+
# the limited data seen by each worker. We use pandas to compute the exact dtype
92+
# over the whole column for each column. The index is set inside `get_dtypes`.
93+
dtypes = cls.get_dtypes(dtypes_ids, columns)
9494

9595
new_frame = cls.frame_cls(
9696
np.array(partition_ids),

modin/core/io/text/text_file_dispatcher.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -909,19 +909,14 @@ def _get_new_qc(
909909
New query compiler, created from `new_frame`.
910910
"""
911911
new_index, row_lengths = cls._define_index(index_ids, index_name)
912+
# Compose modin partitions from `partition_ids`
913+
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
914+
912915
# Compute dtypes by collecting and combining all of the partition dtypes. The
913916
# reported dtypes from differing rows can be different based on the inference in
914917
# the limited data seen by each worker. We use pandas to compute the exact dtype
915918
# over the whole column for each column. The index is set inside `get_dtypes`.
916-
dtypes = cls.get_dtypes(dtypes_ids) if len(dtypes_ids) > 0 else None
917-
# Compose modin partitions from `partition_ids`
918-
partition_ids = cls.build_partition(partition_ids, row_lengths, column_widths)
919-
920-
# Set the index for the dtypes to the column names
921-
if isinstance(dtypes, pandas.Series):
922-
dtypes.index = column_names
923-
else:
924-
dtypes = pandas.Series(dtypes, index=column_names)
919+
dtypes = cls.get_dtypes(dtypes_ids, column_names)
925920

926921
new_frame = cls.frame_cls(
927922
partition_ids,

modin/core/storage_formats/pandas/parsers.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,22 +211,27 @@ def generic_parse(fname, **kwargs):
211211
]
212212

213213
@classmethod
214-
def get_dtypes(cls, dtypes_ids):
214+
def get_dtypes(cls, dtypes_ids, columns):
215215
"""
216216
Get the dtype common to all partitions for each of the columns.
217217
218218
Parameters
219219
----------
220220
dtypes_ids : list
221221
Array with references to the partitions dtypes objects.
222+
columns : array-like or Index (1d)
223+
Column names that will be set as the index of the
224+
resulting dtypes.
222225
223226
Returns
224227
-------
225-
frame_dtypes : pandas.Series or dtype
228+
frame_dtypes : pandas.Series, dtype or None
226229
Resulting dtype or pandas.Series where column names are used as
227230
index and types of columns are used as values for full resulting
228231
frame.
229232
"""
233+
if len(dtypes_ids) == 0:
234+
return None
230235
# each element in `partitions_dtypes` is a Series, where column names are
231236
# used as index and types of columns for different partitions are used as values
232237
partitions_dtypes = cls.materialize(dtypes_ids)
@@ -251,6 +256,12 @@ def get_dtypes(cls, dtypes_ids):
251256
axis=1,
252257
).squeeze(axis=0)
253258

259+
# Set the index for the dtypes to the column names
260+
if isinstance(frame_dtypes, pandas.Series):
261+
frame_dtypes.index = columns
262+
else:
263+
frame_dtypes = pandas.Series(frame_dtypes, index=columns)
264+
254265
return frame_dtypes
255266

256267
@classmethod

0 commit comments

Comments
 (0)