diff --git a/RELEASE.rst b/RELEASE.rst
index 0ade3e92c164a..2f98922eb403e 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -144,6 +144,7 @@ pandas 0.11.1
   - Fix plotting of unordered DatetimeIndex (GH3601_)
   - ``sql.write_frame`` failing when writing a single column to sqlite (GH3628_),
     thanks to @stonebig
+  - Fix pivoting with ``nan`` in the index (GH3558_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -194,6 +195,7 @@ pandas 0.11.1
 .. _GH3617: https://github.com/pydata/pandas/issues/3617
 .. _GH3435: https://github.com/pydata/pandas/issues/3435
 .. _GH3611: https://github.com/pydata/pandas/issues/3611
+.. _GH3558: https://github.com/pydata/pandas/issues/3558
 .. _GH3062: https://github.com/pydata/pandas/issues/3062
 .. _GH3624: https://github.com/pydata/pandas/issues/3624
 .. _GH3626: https://github.com/pydata/pandas/issues/3626
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 02f1cf4539ac4..ea684ef11446c 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -469,11 +469,14 @@ def _reindex(keys, level=None):
 
                     missing = com._ensure_platform_int(missing)
                     missing_labels = keyarr.take(missing)
-                    missing_labels_indexer = com._ensure_int64(l[~check])
+                    missing_indexer = com._ensure_int64(l[~check])
                     cur_labels = result._get_axis(axis).values
-                    cur_labels_indexer = com._ensure_int64(l[check])
-                    new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
-                                                           missing_labels, missing_labels_indexer)
+                    cur_indexer = com._ensure_int64(l[check])
+
+                    new_labels = np.empty(tuple([len(indexer)]),dtype=object)
+                    new_labels[cur_indexer] = cur_labels
+                    new_labels[missing_indexer] = missing_labels
+
                     result = result.reindex_axis(new_labels,axis=axis)
 
                 return result
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 8595e2a91906d..b2e5bb01f53af 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -10,12 +10,12 @@
 
 from pandas.core.categorical import Categorical
 from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
-                                _maybe_upcast)
+                                _maybe_upcast, isnull)
 from pandas.core.groupby import (get_group_index, _compress_group_index,
                                  decons_group_index)
 import pandas.core.common as com
 import pandas.algos as algos
-
+from pandas import lib
 from pandas.core.index import MultiIndex, Index
 
 
@@ -67,7 +67,14 @@ def __init__(self, values, index, level=-1, value_columns=None):
 
         self.index = index
         self.level = self.index._get_level_number(level)
-        self.new_index_levels = list(index.levels)
+        levels = index.levels
+        labels = index.labels
+        def _make_index(lev,lab):
+            i = lev.__class__(_make_index_array_level(lev.values,lab))
+            i.name = lev.name
+            return i
+
+        self.new_index_levels = list([ _make_index(lev,lab) for lev,lab in zip(levels,labels) ])
         self.new_index_names = list(index.names)
         self.removed_name = self.new_index_names.pop(self.level)
 
@@ -140,6 +147,19 @@ def get_result(self):
                 values = com.take_nd(values, inds, axis=1)
                 columns = columns[inds]
 
+        # we might have a missing index
+        if len(index) != values.shape[0]:
+            mask = isnull(index)
+            if mask.any():
+                l = np.arange(len(index))
+                values, orig_values = np.empty((len(index),values.shape[1])), values
+                values.fill(np.nan)
+                values_indexer = com._ensure_int64(l[~mask])
+                for i, j in enumerate(values_indexer):
+                    values[j] = orig_values[i]
+            else:
+                index = index.take(self.unique_groups)
+
         return DataFrame(values, index=index, columns=columns)
 
     def get_new_values(self):
@@ -201,11 +221,13 @@ def get_new_columns(self):
     def get_new_index(self):
         result_labels = []
         for cur in self.sorted_labels[:-1]:
-            result_labels.append(cur.take(self.compressor))
+            labels = cur.take(self.compressor)
+            labels = _make_index_array_level(labels,cur)
+            result_labels.append(labels)
 
         # construct the new index
         if len(self.new_index_levels) == 1:
-            new_index = self.new_index_levels[0].take(self.unique_groups)
+            new_index = self.new_index_levels[0]
             new_index.name = self.new_index_names[0]
         else:
             new_index = MultiIndex(levels=self.new_index_levels,
@@ -215,6 +237,26 @@ def get_new_index(self):
 
         return new_index
 
+def _make_index_array_level(lev,lab):
+    """ create the combined index array, preserving nans, return an array """
+    mask = lab == -1
+    if not mask.any():
+        return lev
+
+    l = np.arange(len(lab))
+    mask_labels = np.empty(len(mask[mask]),dtype=object)
+    mask_labels.fill(np.nan)
+    mask_indexer = com._ensure_int64(l[mask])
+
+    labels = lev
+    labels_indexer = com._ensure_int64(l[~mask])
+
+    new_labels = np.empty(tuple([len(lab)]),dtype=object)
+    new_labels[labels_indexer] = labels
+    new_labels[mask_indexer] = mask_labels
+
+    return new_labels
+
 def _unstack_multiple(data, clocs):
     if len(clocs) == 0:
         return data
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 30c65d9fcdd9f..15791a984ecc5 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -416,26 +416,6 @@ def dicts_to_array(list dicts, list columns):
 
     return result
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
-                          ndarray b, ndarray[int64_t] b_indexer):
-    cdef:
-        Py_ssize_t i, n_a, n_b
-        ndarray result
-
-    n_a = len(a)
-    n_b = len(b)
-    result = np.empty(n_a+n_b,dtype=object)
-
-    for i in range(n_a):
-        result[a_indexer[i]] = a[i]
-    for i in range(n_b):
-        result[b_indexer[i]] = b[i]
-
-    return result
-
-
 def fast_zip(list ndarrays):
     '''
     For zipping multiple ndarrays into an ndarray of tuples
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index d90aa369aa46e..e9afa1ae6ec1d 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -840,6 +840,16 @@ def test_set_index_nan(self):
         result = df.set_index(['year','PRuid','QC']).reset_index().reindex(columns=df.columns)
         assert_frame_equal(result,df)
 
+    def test_multi_nan_indexing(self):
+
+        # GH 3588
+        df = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]})
+        result = df.set_index(['a','b'], drop=False)
+        expected = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]},
+                             index = [Index(['R1','R2',np.nan,'R4'],name='a'),Index(['C1','C2','C3','C4'],name='b')])
+        assert_frame_equal(result,expected)
+
+
     def test_iloc_panel_issue(self):
 
         # GH 3617
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
index c0e0de1a23dad..e333691b1e6d2 100644
--- a/pandas/tools/tests/test_pivot.py
+++ b/pandas/tools/tests/test_pivot.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, Index
 from pandas.tools.merge import concat
 from pandas.tools.pivot import pivot_table, crosstab
 import pandas.util.testing as tm
@@ -129,6 +129,17 @@ def test_pivot_multi_functions(self):
         expected = concat([means, stds], keys=['mean', 'std'], axis=1)
         tm.assert_frame_equal(result, expected)
 
+    def test_pivot_index_with_nan(self):
+        # GH 3588
+        nan = np.nan
+        df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
+        result = df.pivot('a','b','c')
+        expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
+                              [nan,nan,nan,nan],[nan,nan,15,20]],
+                             index = Index(['R1','R2',nan,'R4'],name='a'),
+                             columns = Index(['C1','C2','C3','C4'],name='b'))
+        tm.assert_frame_equal(result, expected)
+
     def test_margins(self):
         def _check_output(res, col, rows=['A', 'B'], cols=['C']):
             cmarg = res['All'][:-1]
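
Note (not part of the patch): in pandas/core/indexing.py the removed Cython helper lib.combine_from_indexers is replaced by a plain NumPy scatter into an object array. A minimal standalone sketch of that idea, using made-up label arrays and indexers rather than anything taken from the patch:

    import numpy as np

    # labels that were found in the axis vs. labels that were missing,
    # plus the positions each group should occupy in the final ordering
    cur_labels = np.array(['a', 'b'], dtype=object)
    missing_labels = np.array(['z'], dtype=object)
    cur_indexer = np.array([0, 2], dtype=np.int64)
    missing_indexer = np.array([1], dtype=np.int64)

    # scatter both groups into one object array; this is the role the
    # removed combine_from_indexers used to play in Cython
    new_labels = np.empty(len(cur_indexer) + len(missing_indexer), dtype=object)
    new_labels[cur_indexer] = cur_labels
    new_labels[missing_indexer] = missing_labels
    # new_labels is now array(['a', 'z', 'b'], dtype=object)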
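
Note (not part of the patch): a rough usage sketch of what the new tests exercise, reusing the frame from test_pivot_index_with_nan and test_multi_nan_indexing; the output is only printed, not asserted, and the positional df.pivot call mirrors the 0.11-era API used in the test:

    import numpy as np
    from pandas import DataFrame

    df = DataFrame({"a": ["R1", "R2", np.nan, "R4"],
                    "b": ["C1", "C2", "C3", "C4"],
                    "c": [10, 15, np.nan, 20]})

    # GH 3588: set_index on multiple columns keeps np.nan as an index entry
    indexed = df.set_index(["a", "b"], drop=False)
    print(indexed)

    # GH 3558: pivoting with nan in the index is the case this patch fixes
    print(df.pivot("a", "b", "c"))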