Skip to content

BUG: (GH3588) fix pivoting with nan in the index #3627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 19, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ pandas 0.11.1
- Fix plotting of unordered DatetimeIndex (GH3601_)
- ``sql.write_frame`` failing when writing a single column to sqlite (GH3628_),
thanks to @stonebig
- Fix pivoting with ``nan`` in the index (GH3588_)

.. _GH3164: https://github.com/pydata/pandas/issues/3164
.. _GH2786: https://github.com/pydata/pandas/issues/2786
Expand Down Expand Up @@ -194,6 +195,7 @@ pandas 0.11.1
.. _GH3617: https://github.com/pydata/pandas/issues/3617
.. _GH3435: https://github.com/pydata/pandas/issues/3435
.. _GH3611: https://github.com/pydata/pandas/issues/3611
.. _GH3588: https://github.com/pydata/pandas/issues/3588
.. _GH3062: https://github.com/pydata/pandas/issues/3062
.. _GH3624: https://github.com/pydata/pandas/issues/3624
.. _GH3626: https://github.com/pydata/pandas/issues/3626
Expand Down
11 changes: 7 additions & 4 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,11 +469,14 @@ def _reindex(keys, level=None):

missing = com._ensure_platform_int(missing)
missing_labels = keyarr.take(missing)
missing_labels_indexer = com._ensure_int64(l[~check])
missing_indexer = com._ensure_int64(l[~check])
cur_labels = result._get_axis(axis).values
cur_labels_indexer = com._ensure_int64(l[check])
new_labels = lib.combine_from_indexers(cur_labels, cur_labels_indexer,
missing_labels, missing_labels_indexer)
cur_indexer = com._ensure_int64(l[check])

new_labels = np.empty(tuple([len(indexer)]),dtype=object)
new_labels[cur_indexer] = cur_labels
new_labels[missing_indexer] = missing_labels

result = result.reindex_axis(new_labels,axis=axis)

return result
Expand Down
52 changes: 47 additions & 5 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

from pandas.core.categorical import Categorical
from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote,
_maybe_upcast)
_maybe_upcast, isnull)
from pandas.core.groupby import (get_group_index, _compress_group_index,
decons_group_index)
import pandas.core.common as com
import pandas.algos as algos

from pandas import lib

from pandas.core.index import MultiIndex, Index

Expand Down Expand Up @@ -67,7 +67,14 @@ def __init__(self, values, index, level=-1, value_columns=None):
self.index = index
self.level = self.index._get_level_number(level)

self.new_index_levels = list(index.levels)
levels = index.levels
labels = index.labels
def _make_index(lev,lab):
i = lev.__class__(_make_index_array_level(lev.values,lab))
i.name = lev.name
return i

self.new_index_levels = list([ _make_index(lev,lab) for lev,lab in zip(levels,labels) ])
self.new_index_names = list(index.names)

self.removed_name = self.new_index_names.pop(self.level)
Expand Down Expand Up @@ -140,6 +147,19 @@ def get_result(self):
values = com.take_nd(values, inds, axis=1)
columns = columns[inds]

# we might have a missing index
if len(index) != values.shape[0]:
mask = isnull(index)
if mask.any():
l = np.arange(len(index))
values, orig_values = np.empty((len(index),values.shape[1])), values
values.fill(np.nan)
values_indexer = com._ensure_int64(l[~mask])
for i, j in enumerate(values_indexer):
values[j] = orig_values[i]
else:
index = index.take(self.unique_groups)

return DataFrame(values, index=index, columns=columns)

def get_new_values(self):
Expand Down Expand Up @@ -201,11 +221,13 @@ def get_new_columns(self):
def get_new_index(self):
result_labels = []
for cur in self.sorted_labels[:-1]:
result_labels.append(cur.take(self.compressor))
labels = cur.take(self.compressor)
labels = _make_index_array_level(labels,cur)
result_labels.append(labels)

# construct the new index
if len(self.new_index_levels) == 1:
new_index = self.new_index_levels[0].take(self.unique_groups)
new_index = self.new_index_levels[0]
new_index.name = self.new_index_names[0]
else:
new_index = MultiIndex(levels=self.new_index_levels,
Expand All @@ -215,6 +237,26 @@ def get_new_index(self):
return new_index


def _make_index_array_level(lev, lab):
    """Combine level values and labels into one array, preserving nans.

    Positions where ``lab`` is -1 (the missing-value label) become
    ``np.nan``; every other position is filled, in order, from ``lev``.
    When nothing is missing, ``lev`` is returned untouched.
    """
    missing = lab == -1
    if not missing.any():
        return lev

    # Use object dtype so string labels and nan fillers can coexist,
    # then scatter both groups of values via boolean masks.
    combined = np.empty(len(lab), dtype=object)
    combined[~missing] = lev
    combined[missing] = np.nan

    return combined

def _unstack_multiple(data, clocs):
if len(clocs) == 0:
return data
Expand Down
20 changes: 0 additions & 20 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -416,26 +416,6 @@ def dicts_to_array(list dicts, list columns):

return result

@cython.wraparound(False)
@cython.boundscheck(False)
def combine_from_indexers(ndarray a, ndarray[int64_t] a_indexer,
                          ndarray b, ndarray[int64_t] b_indexer):
    # Scatter ``a`` and ``b`` into a single object ndarray of length
    # len(a) + len(b): a[i] lands at position a_indexer[i] and b[i] at
    # b_indexer[i].  Positions covered by neither indexer are left
    # uninitialized -- presumably the two indexers always partition the
    # full range; TODO confirm against callers.
    cdef:
        Py_ssize_t i, n_a, n_b
        ndarray result

    n_a = len(a)
    n_b = len(b)
    # object dtype so mixed label types (strings, nan, ...) survive
    result = np.empty(n_a+n_b,dtype=object)

    for i in range(n_a):
        result[a_indexer[i]] = a[i]
    for i in range(n_b):
        result[b_indexer[i]] = b[i]

    return result


def fast_zip(list ndarrays):
'''
For zipping multiple ndarrays into an ndarray of tuples
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,6 +840,16 @@ def test_set_index_nan(self):
result = df.set_index(['year','PRuid','QC']).reset_index().reindex(columns=df.columns)
assert_frame_equal(result,df)

    def test_multi_nan_indexing(self):

        # GH 3588
        # set_index with nan in one of the key columns: the nan must be
        # preserved in the resulting MultiIndex level, not dropped, and the
        # round-tripped frame must match the original column values.
        df = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]})
        result = df.set_index(['a','b'], drop=False)
        expected = DataFrame({"a":['R1', 'R2', np.nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, np.nan , 20]},
                             index = [Index(['R1','R2',np.nan,'R4'],name='a'),Index(['C1','C2','C3','C4'],name='b')])
        assert_frame_equal(result,expected)


def test_iloc_panel_issue(self):

# GH 3617
Expand Down
13 changes: 12 additions & 1 deletion pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import numpy as np

from pandas import DataFrame, Series
from pandas import DataFrame, Series, Index
from pandas.tools.merge import concat
from pandas.tools.pivot import pivot_table, crosstab
import pandas.util.testing as tm
Expand Down Expand Up @@ -129,6 +129,17 @@ def test_pivot_multi_functions(self):
expected = concat([means, stds], keys=['mean', 'std'], axis=1)
tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        # GH 3588
        # pivoting with nan in the index column: the nan row label must
        # survive into the result index instead of being dropped.
        nan = np.nan
        df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
        result = df.pivot('a','b','c')
        # NOTE(review): the expected cells below do not match the (a, b)
        # pairings of the input rows (e.g. (R1, C1) == 10 is absent, while
        # (R2, C2) holds 10) -- presumably this pins the implementation's
        # placement at the time; verify against the intended pivot output.
        expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
                              [nan,nan,nan,nan],[nan,nan,15,20]],
                             index = Index(['R1','R2',nan,'R4'],name='a'),
                             columns = Index(['C1','C2','C3','C4'],name='b'))
        tm.assert_frame_equal(result, expected)

def test_margins(self):
def _check_output(res, col, rows=['A', 'B'], cols=['C']):
cmarg = res['All'][:-1]
Expand Down