From 8f1cb5a6fab104065c1d7336e11bee8273f6232a Mon Sep 17 00:00:00 2001 From: William Ma Date: Sun, 16 Sep 2018 16:02:27 -0700 Subject: [PATCH 1/3] Quick fix for info --- modin/data_management/data_manager.py | 29 ++++++++++++++------------- modin/pandas/dataframe.py | 5 ++--- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py index 7024b3a130f..e0832f96d8d 100644 --- a/modin/data_management/data_manager.py +++ b/modin/data_management/data_manager.py @@ -680,13 +680,14 @@ def round(self, **kwargs): # Currently, this means a Pandas Series will be returned, but in the future # we will implement a Distributed Series, and this will be returned # instead. - def full_axis_reduce(self, func, axis): + def full_axis_reduce(self, func, axis, named_index=True): result = self.data.map_across_full_axis(axis, func).to_pandas(self._is_transposed) - if not axis: - result.index = self.columns - else: - result.index = self.index + if named_index: + if not axis: + result.index = self.columns + else: + result.index = self.index return result @@ -751,14 +752,6 @@ def idxmin_builder(df, **kwargs): return self._post_process_idx_ops(axis, min_result) def info(self, **kwargs): - def info_builder(df, **kwargs): - result = pandas.DataFrame() - if memory_usage: - result['memory'] = df.memory_usage(index=False, deep=memory_usage_deep) - if null_counts: - result['count'] = df.count(axis=0) - return result - memory_usage = kwargs.get('memory_usage', True) null_counts = kwargs.get('null_counts', True) @@ -767,8 +760,16 @@ def info_builder(df, **kwargs): else: memory_usage_deep = False + def info_builder(df, **kwargs): + result = pandas.DataFrame() + if memory_usage: + result['memory'] = df.memory_usage(index=False, deep=memory_usage_deep) + if null_counts: + result['count'] = df.count(axis=0) + return result + func = self._prepare_method(info_builder, **kwargs) - return self.full_axis_reduce(func, 0) + return self.full_axis_reduce(func, 0, False) def last_valid_index(self): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 839dc21c0b9..d2afb6e1987 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1727,15 +1727,14 @@ def info(self, if memory_usage or null_counts: results_data = self._data_manager.info( verbose=actually_verbose, - buf=buf, max_cols=max_cols, memory_usage=memory_usage, null_counts=null_counts ) if null_counts: - # For some reason, the counts table has a shape of (columns, columns) counts = results_data['count'] counts.columns = columns + print(counts) if memory_usage: # For some reason, the memory table has a shape of (columns, columns) # but it doesn't matter because the cells not on the diagonal are NaN @@ -1748,7 +1747,7 @@ def info(self, for col, dtype in zip(columns, dtypes): col_string += '{0}\t'.format(col) if null_counts: - col_string += '{0} not-null '.format(counts.loc[col, col]) + col_string += '{0} not-null '.format(counts[col]) col_string += '{0}\n'.format(dtype) else: # Create string for not verbose output From 98f44a28c093e43cf5a2268429f04d95a6923f7b Mon Sep 17 00:00:00 2001 From: William Ma Date: Sun, 16 Sep 2018 17:05:33 -0700 Subject: [PATCH 2/3] Removed extraneous print statement --- modin/pandas/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index d2afb6e1987..a9af0f45d80 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1734,7 +1734,6 @@ def info(self, if null_counts: counts = results_data['count'] counts.columns = columns - print(counts) if memory_usage: # For some reason, the memory table has a shape of (columns, columns) # but it doesn't matter because the cells not on the diagonal are NaN From 30daae1cfe093c147391fc3f69c67bcac644ac70 Mon Sep 17 00:00:00 2001 From: William Ma Date: Sun, 16 Sep 2018 18:09:40 -0700 Subject: [PATCH 3/3] Restructured to use count and memory_usage instead --- modin/data_management/data_manager.py | 31 +++++---------------------- modin/pandas/dataframe.py | 18 ++++------------ 2 files changed, 9 insertions(+), 40 deletions(-) diff --git a/modin/data_management/data_manager.py b/modin/data_management/data_manager.py index e0832f96d8d..74c2aadaf43 100644 --- a/modin/data_management/data_manager.py +++ b/modin/data_management/data_manager.py @@ -680,14 +680,13 @@ def round(self, **kwargs): # Currently, this means a Pandas Series will be returned, but in the future # we will implement a Distributed Series, and this will be returned # instead. - def full_axis_reduce(self, func, axis, named_index=True): + def full_axis_reduce(self, func, axis): result = self.data.map_across_full_axis(axis, func).to_pandas(self._is_transposed) - if named_index: - if not axis: - result.index = self.columns - else: - result.index = self.index + if not axis: + result.index = self.columns + else: + result.index = self.index return result @@ -751,26 +750,6 @@ def idxmin_builder(df, **kwargs): # have to do a conversion. return self._post_process_idx_ops(axis, min_result) - def info(self, **kwargs): - memory_usage = kwargs.get('memory_usage', True) - null_counts = kwargs.get('null_counts', True) - - if type(memory_usage) == str and memory_usage == 'deep': - memory_usage_deep = True - else: - memory_usage_deep = False - - def info_builder(df, **kwargs): - result = pandas.DataFrame() - if memory_usage: - result['memory'] = df.memory_usage(index=False, deep=memory_usage_deep) - if null_counts: - result['count'] = df.count(axis=0) - return result - - func = self._prepare_method(info_builder, **kwargs) - return self.full_axis_reduce(func, 0, False) - def last_valid_index(self): def last_valid_index_builder(df): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index a9af0f45d80..4b85bb62a66 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1724,20 +1724,10 @@ def info(self, # Create the Index info() string by parsing self.index index_string = index.summary() + '\n' - if memory_usage or null_counts: - results_data = self._data_manager.info( - verbose=actually_verbose, - max_cols=max_cols, - memory_usage=memory_usage, - null_counts=null_counts - ) - if null_counts: - counts = results_data['count'] - counts.columns = columns - if memory_usage: - # For some reason, the memory table has a shape of (columns, columns) - # but it doesn't matter because the cells not on the diagonal are NaN - memory_usage_data = results_data['memory'].sum() + index.memory_usage(deep=memory_usage_deep) + if null_counts: + counts = self._data_manager.count() + if memory_usage: + memory_usage_data = self._data_manager.memory_usage(deep=memory_usage_deep, index=True) if actually_verbose: # Create string for verbose output