Skip to content

ENH: GH9746 DataFrame.unstack and Series.unstack now take fill_value … #10246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions doc/source/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,27 @@ which level in the columns to stack:
df2.stack('exp')
df2.stack('animal')

Unstacking can result in missing values if subgroups do not have the same
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded tag

set of labels. By default, missing values will be replaced with the default
fill value for that data type, ``NaN`` for float, ``NaT`` for datetimelike,
etc. For integer types, by default data will be converted to float and missing
values will be set to ``NaN``.

.. ipython:: python

df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
df3
df3.unstack()

.. versionadded:: 0.18.0

Alternatively, unstack takes an optional ``fill_value`` argument, for specifying
the value of missing data.

.. ipython:: python

df3.unstack(fill_value=-1e9)

With a MultiIndex
~~~~~~~~~~~~~~~~~

Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.18.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ Other API Changes

- ``pandas.merge()`` and ``DataFrame.merge()`` will show a specific error message when trying to merge with an object that is not of type ``DataFrame`` or a subclass (:issue:`12081`)

- ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of
missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit,
specifying ``fill_value`` will preserve the data type of the original stacked data. (:issue:`9746`)

.. _whatsnew_0180.deprecations:

Deprecations
Expand Down
16 changes: 16 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,12 @@ def _maybe_promote(dtype, fill_value=np.nan):
# the proper thing to do here would probably be to upcast
# to object (but numpy 1.6.1 doesn't do this properly)
fill_value = tslib.iNaT
elif issubclass(dtype.type, np.timedelta64):
try:
fill_value = lib.Timedelta(fill_value).value
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@amcpherson I think we can eliminate this section here, and just upcast to np.object_ if the fill_value cannot be coerced, (and the above about datetimes), can you create an issue for this? (and PR would be great as well!) thanks

This is some pretty old code I think

except:
# as for datetimes, cannot upcast to object
fill_value = tslib.iNaT
else:
fill_value = tslib.iNaT
elif is_datetimetz(dtype):
Expand All @@ -1153,6 +1159,16 @@ def _maybe_promote(dtype, fill_value=np.nan):
dtype = np.object_
elif issubclass(dtype.type, (np.integer, np.floating)):
dtype = np.complex128
elif fill_value is None:
if is_float_dtype(dtype) or is_complex_dtype(dtype):
fill_value = np.nan
elif is_integer_dtype(dtype):
dtype = np.float64
fill_value = np.nan
elif is_datetime_or_timedelta_dtype(dtype):
fill_value = tslib.iNaT
else:
dtype = np.object_
else:
dtype = np.object_

Expand Down
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3851,7 +3851,7 @@ def stack(self, level=-1, dropna=True):
else:
return stack(self, level, dropna=dropna)

def unstack(self, level=-1):
def unstack(self, level=-1, fill_value=None):
"""
Pivot a level of the (necessarily hierarchical) index labels, returning
a DataFrame having a new level of column labels whose inner-most level
Expand All @@ -3864,6 +3864,10 @@ def unstack(self, level=-1):
----------
level : int, string, or list of these, default -1 (last level)
Level(s) of index to unstack, can pass level name
fill_value : replace NaN with this value if the unstack produces
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a versionadded tag

missing values

.. versionadded:: 0.18.0

See also
--------
Expand Down Expand Up @@ -3905,7 +3909,7 @@ def unstack(self, level=-1):
unstacked : DataFrame or Series
"""
from pandas.core.reshape import unstack
return unstack(self, level)
return unstack(self, level, fill_value)

# ----------------------------------------------------------------------
# Time series-related
Expand Down
21 changes: 13 additions & 8 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ class _Unstacker(object):
unstacked : DataFrame
"""

def __init__(self, values, index, level=-1, value_columns=None):
def __init__(self, values, index, level=-1, value_columns=None,
fill_value=None):

self.is_categorical = None
if values.ndim == 1:
Expand All @@ -70,6 +71,7 @@ def __init__(self, values, index, level=-1, value_columns=None):
values = values[:, np.newaxis]
self.values = values
self.value_columns = value_columns
self.fill_value = fill_value

if value_columns is None and values.shape[1] != 1: # pragma: no cover
raise ValueError('must pass column labels for multi-column data')
Expand Down Expand Up @@ -178,7 +180,7 @@ def get_new_values(self):
dtype = values.dtype
new_values = np.empty(result_shape, dtype=dtype)
else:
dtype, fill_value = _maybe_promote(values.dtype)
dtype, fill_value = _maybe_promote(values.dtype, self.fill_value)
new_values = np.empty(result_shape, dtype=dtype)
new_values.fill(fill_value)

Expand Down Expand Up @@ -389,21 +391,22 @@ def _slow_pivot(index, columns, values):
return DataFrame(tree)


def unstack(obj, level):
def unstack(obj, level, fill_value=None):
if isinstance(level, (tuple, list)):
return _unstack_multiple(obj, level)

if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
return _unstack_frame(obj, level)
return _unstack_frame(obj, level, fill_value=fill_value)
else:
return obj.T.stack(dropna=False)
else:
unstacker = _Unstacker(obj.values, obj.index, level=level)
unstacker = _Unstacker(obj.values, obj.index, level=level,
fill_value=fill_value)
return unstacker.get_result()


def _unstack_frame(obj, level):
def _unstack_frame(obj, level, fill_value=None):
from pandas.core.internals import BlockManager, make_block

if obj._is_mixed_type:
Expand All @@ -419,7 +422,8 @@ def _unstack_frame(obj, level):
for blk in obj._data.blocks:
blk_items = obj._data.items[blk.mgr_locs.indexer]
bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
value_columns=blk_items)
value_columns=blk_items,
fill_value=fill_value)
new_items = bunstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = bunstacker.get_new_values()
Expand All @@ -435,7 +439,8 @@ def _unstack_frame(obj, level):
return result.ix[:, mask_frame.sum(0) > 0]
else:
unstacker = _Unstacker(obj.values, obj.index, level=level,
value_columns=obj.columns)
value_columns=obj.columns,
fill_value=fill_value)
return unstacker.get_result()


Expand Down
8 changes: 6 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2003,7 +2003,7 @@ def reorder_levels(self, order):
result.index = result.index.reorder_levels(order)
return result

def unstack(self, level=-1):
def unstack(self, level=-1, fill_value=None):
"""
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
The level involved will automatically get sorted.
Expand All @@ -2012,6 +2012,10 @@ def unstack(self, level=-1):
----------
level : int, string, or list of these, default last level
Level(s) to unstack, can pass level name
fill_value : replace NaN with this value if the unstack produces
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a version added tag

missing values

.. versionadded:: 0.18.0

Examples
--------
Expand All @@ -2036,7 +2040,7 @@ def unstack(self, level=-1):
unstacked : DataFrame
"""
from pandas.core.reshape import unstack
return unstack(self, level)
return unstack(self, level, fill_value)

# ----------------------------------------------------------------------
# function application
Expand Down
137 changes: 136 additions & 1 deletion pandas/tests/frame/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np

from pandas.compat import u
from pandas import DataFrame, Index, Series, MultiIndex, date_range
from pandas import DataFrame, Index, Series, MultiIndex, date_range, Timedelta, Period
import pandas as pd

from pandas.util.testing import (assert_series_equal,
Expand Down Expand Up @@ -136,6 +136,141 @@ def test_stack_unstack(self):
assert_frame_equal(unstacked_cols.T, self.frame)
assert_frame_equal(unstacked_cols_df['bar'].T, self.frame)

def test_unstack_fill(self):

# GH #9746: fill_value keyword argument for Series
# and DataFrame unstack

# From a series
data = Series([1, 2, 4, 5], dtype=np.int16)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

result = data.unstack(fill_value=-1)
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
index=['x', 'y', 'z'], dtype=np.int16)
assert_frame_equal(result, expected)

# From a series with incorrect data type for fill_value
result = data.unstack(fill_value=0.5)
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
index=['x', 'y', 'z'], dtype=np.float)
assert_frame_equal(result, expected)

# From a dataframe
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
df.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

result = df.unstack(fill_value=-1)

rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)

# From a mixed type dataframe
df['A'] = df['A'].astype(np.int16)
df['B'] = df['B'].astype(np.float64)

result = df.unstack(fill_value=-1)
expected['A'] = expected['A'].astype(np.int16)
expected['B'] = expected['B'].astype(np.float64)
assert_frame_equal(result, expected)

# From a dataframe with incorrect data type for fill_value
result = df.unstack(fill_value=0.5)

rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)

# Test unstacking with date times
dv = pd.date_range('2012-01-01', periods=4).values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test with timedeltas as well (similar to the datetimes test)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, timedeltas required some changes to _maybe_promote, good catch. As
with datetime, can't seem to upcast to object.

On Wed, Jan 20, 2016 at 8:48 AM Jeff Reback notifications@github.com
wrote:

In pandas/tests/frame/test_reshape.py
#10246 (comment):

  •    result = df.unstack(fill_value=-1)
    
  •    expected['A'] = expected['A'].astype(np.int16)
    
  •    expected['B'] = expected['B'].astype(np.float64)
    
  •    assert_frame_equal(result, expected)
    
  •    # From a dataframe with incorrect data type for fill_value
    
  •    result = df.unstack(fill_value=0.5)
    
  •    rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
    
  •    expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
    
  •    expected.columns = MultiIndex.from_tuples(
    
  •        [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
    
  •    assert_frame_equal(result, expected)
    
  •    # Test unstacking with date times
    
  •    dv = pd.date_range('2012-01-01', periods=4).values
    

can you add a test with timedeltas as well (similar to the datetimes test)


Reply to this email directly or view it on GitHub
https://github.com/pydata/pandas/pull/10246/files#r50281164.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gr8! also add a test with Period, though these are object ATM so should work w/o changes.

data = Series(dv)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

result = data.unstack()
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
'b': [dv[1], dv[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

result = data.unstack(fill_value=dv[0])
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
'b': [dv[1], dv[2], dv[0]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

# Test unstacking with time deltas
td = [Timedelta(days=i) for i in range(4)]
data = Series(td)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

result = data.unstack()
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
'b': [td[1], td[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

result = data.unstack(fill_value=td[1])
expected = DataFrame({'a': [td[0], td[1], td[3]],
'b': [td[1], td[2], td[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

# Test unstacking with period
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
Period('2012-04')]
data = Series(periods)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

result = data.unstack()
expected = DataFrame({'a': [periods[0], None, periods[3]],
'b': [periods[1], periods[2], None]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

result = data.unstack(fill_value=periods[1])
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
'b': [periods[1], periods[2], periods[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)

# Test unstacking with categorical
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
data.index = pd.MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])

# By default missing values will be NaN
result = data.unstack()
expected = DataFrame({'a': pd.Categorical(list('axa'),
categories=list('abc')),
'b': pd.Categorical(list('bcx'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)

# Fill with non-category results in NaN entries similar to above
result = data.unstack(fill_value='d')
assert_frame_equal(result, expected)

# Fill with category value replaces missing values as expected
result = data.unstack(fill_value='c')
expected = DataFrame({'a': pd.Categorical(list('aca'),
categories=list('abc')),
'b': pd.Categorical(list('bcc'),
categories=list('abc'))},
index=list('xyz'))
assert_frame_equal(result, expected)

def test_stack_ints(self):
df = DataFrame(
np.random.randn(30, 27),
Expand Down