From e513486e6c101ef91d0726428b2dcdb208a82475 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Sun, 18 Jan 2015 17:25:17 -0500 Subject: [PATCH] tests for issues with unstack with nan --- doc/source/whatsnew/v0.16.0.txt | 2 +- pandas/core/reshape.py | 38 +++++++----- pandas/tests/test_frame.py | 106 ++++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 1 - 4 files changed, 129 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index b0c5b11079f31..6082a58687c2c 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -146,7 +146,7 @@ Bug Fixes - Fixed bug on bug endian platforms which produced incorrect results in ``StataReader`` (:issue:`8688`). - Bug in ``MultiIndex.has_duplicates`` when having many levels causes an indexer overflow (:issue:`9075`, :issue:`5873`) -- Bug in ``pivot`` and `unstack`` where ``nan`` values would break index alignment (:issue:`7466`) +- Bug in ``pivot`` and ``unstack`` where ``nan`` values would break index alignment (:issue:`4862`, :issue:`7401`, :issue:`7403`, :issue:`7405`, :issue:`7466`) - Bug in left ``join`` on multi-index with ``sort=True`` or null values (:issue:`9210`). - Bug in ``MultiIndex`` where inserting new keys would fail (:issue:`9250`). 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 659d944a5e784..18dab471e3de2 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -17,7 +17,7 @@ import pandas.core.common as com import pandas.algos as algos -from pandas.core.index import MultiIndex, _get_na_value +from pandas.core.index import MultiIndex class _Unstacker(object): @@ -198,14 +198,8 @@ def get_new_values(self): def get_new_columns(self): if self.value_columns is None: - if self.lift == 0: - return self.removed_level - - lev = self.removed_level - vals = np.insert(lev.astype('object'), 0, - _get_na_value(lev.dtype.type)) - - return lev._shallow_copy(vals) + return _make_new_index(self.removed_level, None) \ + if self.lift != 0 else self.removed_level stride = len(self.removed_level) + self.lift width = len(self.value_columns) @@ -232,19 +226,31 @@ def get_new_index(self): # construct the new index if len(self.new_index_levels) == 1: lev, lab = self.new_index_levels[0], result_labels[0] - if not (lab == -1).any(): - return lev.take(lab) - - vals = np.insert(lev.astype('object'), len(lev), - _get_na_value(lev.dtype.type)).take(lab) - - return lev._shallow_copy(vals) + return _make_new_index(lev, lab) \ + if (lab == -1).any() else lev.take(lab) return MultiIndex(levels=self.new_index_levels, labels=result_labels, names=self.new_index_names, verify_integrity=False) + +def _make_new_index(lev, lab): + from pandas.core.index import Index, _get_na_value + + nan = _get_na_value(lev.dtype.type) + vals = lev.values.astype('object') + vals = np.insert(vals, 0, nan) if lab is None else \ + np.insert(vals, len(vals), nan).take(lab) + + try: + vals = vals.astype(lev.dtype, subok=False, copy=False) + except ValueError: + return Index(vals, **lev._get_attributes_dict()) + + return lev._shallow_copy(vals) + + def _unstack_multiple(data, clocs): if len(clocs) == 0: return data diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 7984c82cfbe9c..563e9d4dae57c 100644 
--- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12328,6 +12328,25 @@ def test_unstack_dtypes(self): expected = Series({'float64' : 2, 'object' : 2}) assert_series_equal(result, expected) + # GH7405 + for c, d in (np.zeros(5), np.zeros(5)), \ + (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')): + + df = DataFrame({'A': ['a']*5, 'C':c, 'D':d, + 'B':pd.date_range('2012-01-01', periods=5)}) + + right = df.iloc[:3].copy(deep=True) + + df = df.set_index(['A', 'B']) + df['D'] = df['D'].astype('int64') + + left = df.iloc[:3].unstack(0) + right = right.set_index(['A', 'B']).unstack(0) + right[('D', 'a')] = right[('D', 'a')].astype('int64') + + self.assertEqual(left.shape, (3, 2)) + tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], names=['c1', 'c1']) @@ -12385,6 +12404,93 @@ def verify(df): for col in ['4th', '5th']: verify(udf[col]) + # GH7403 + df = pd.DataFrame({'A': list('aaaabbbb'),'B':range(8), 'C':range(8)}) + df.iloc[3, 1] = np.NaN + left = df.set_index(['A', 'B']).unstack(0) + + vals = [[3, 0, 1, 2, nan, nan, nan, nan], + [nan, nan, nan, nan, 4, 5, 6, 7]] + vals = list(map(list, zip(*vals))) + idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B') + cols = MultiIndex(levels=[['C'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=[None, 'A']) + + right = DataFrame(vals, columns=cols, index=idx) + assert_frame_equal(left, right) + + df = DataFrame({'A': list('aaaabbbb'), 'B':list(range(4))*2, + 'C':range(8)}) + df.iloc[2,1] = np.NaN + left = df.set_index(['A', 'B']).unstack(0) + + vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]] + cols = MultiIndex(levels=[['C'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=[None, 'A']) + idx = Index([nan, 0, 1, 2, 3], name='B') + right = DataFrame(vals, columns=cols, index=idx) + assert_frame_equal(left, right) + + df = pd.DataFrame({'A': list('aaaabbbb'),'B':list(range(4))*2, + 'C':range(8)}) + df.iloc[3,1] = 
np.NaN + left = df.set_index(['A', 'B']).unstack(0) + + vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]] + cols = MultiIndex(levels=[['C'], ['a', 'b']], + labels=[[0, 0], [0, 1]], + names=[None, 'A']) + idx = Index([nan, 0, 1, 2, 3], name='B') + right = DataFrame(vals, columns=cols, index=idx) + assert_frame_equal(left, right) + + # GH7401 + df = pd.DataFrame({'A': list('aaaaabbbbb'), 'C':np.arange(10), + 'B':date_range('2012-01-01', periods=5).tolist()*2 }) + + df.iloc[3,1] = np.NaN + left = df.set_index(['A', 'B']).unstack() + + vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]]) + idx = Index(['a', 'b'], name='A') + cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)], + labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], + names=[None, 'B']) + + right = DataFrame(vals, columns=cols, index=idx) + assert_frame_equal(left, right) + + # GH4862 + vals = [['Hg', nan, nan, 680585148], + ['U', 0.0, nan, 680585148], + ['Pb', 7.07e-06, nan, 680585148], + ['Sn', 2.3614e-05, 0.0133, 680607017], + ['Ag', 0.0, 0.0133, 680607017], + ['Hg', -0.00015, 0.0133, 680607017]] + df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'], + index=[17263, 17264, 17265, 17266, 17267, 17268]) + + left = df.copy().set_index(['s_id','dosage','agent']).unstack() + + vals = [[nan, nan, 7.07e-06, nan, 0.0], + [0.0, -0.00015, nan, 2.3614e-05, nan]] + + idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]], + labels=[[0, 1], [-1, 0]], + names=['s_id', 'dosage']) + + cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']], + labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], + names=[None, 'agent']) + + right = DataFrame(vals, columns=cols, index=idx) + assert_frame_equal(left, right) + + left = df.ix[17264:].copy().set_index(['s_id','dosage','agent']) + assert_frame_equal(left.unstack(), right) + def test_stack_datetime_column_multiIndex(self): # GH 8039 t = datetime(2014, 1, 1) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py 
index db8ff37e4e1b4..d762ac4ff774e 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5954,7 +5954,6 @@ def test_unstack(self): idx = pd.MultiIndex.from_arrays([[101, 102], [3.5, np.nan]]) ts = pd.Series([1,2], index=idx) left = ts.unstack() - left.columns = left.columns.astype('float64') right = DataFrame([[nan, 1], [2, nan]], index=[101, 102], columns=[nan, 3.5]) assert_frame_equal(left, right)