-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: GH9746 DataFrame.unstack and Series.unstack now take fill_value … #10246
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1127,6 +1127,12 @@ def _maybe_promote(dtype, fill_value=np.nan): | |
# the proper thing to do here would probably be to upcast | ||
# to object (but numpy 1.6.1 doesn't do this properly) | ||
fill_value = tslib.iNaT | ||
elif issubclass(dtype.type, np.timedelta64): | ||
try: | ||
fill_value = lib.Timedelta(fill_value).value | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @amcpherson I think we can eliminate this section here, and just upcast to `object`. This is some pretty old code, I think. |
||
except: | ||
# as for datetimes, cannot upcast to object | ||
fill_value = tslib.iNaT | ||
else: | ||
fill_value = tslib.iNaT | ||
elif is_datetimetz(dtype): | ||
|
@@ -1153,6 +1159,16 @@ def _maybe_promote(dtype, fill_value=np.nan): | |
dtype = np.object_ | ||
elif issubclass(dtype.type, (np.integer, np.floating)): | ||
dtype = np.complex128 | ||
elif fill_value is None: | ||
if is_float_dtype(dtype) or is_complex_dtype(dtype): | ||
fill_value = np.nan | ||
elif is_integer_dtype(dtype): | ||
dtype = np.float64 | ||
fill_value = np.nan | ||
elif is_datetime_or_timedelta_dtype(dtype): | ||
fill_value = tslib.iNaT | ||
else: | ||
dtype = np.object_ | ||
else: | ||
dtype = np.object_ | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3851,7 +3851,7 @@ def stack(self, level=-1, dropna=True): | |
else: | ||
return stack(self, level, dropna=dropna) | ||
|
||
def unstack(self, level=-1): | ||
def unstack(self, level=-1, fill_value=None): | ||
""" | ||
Pivot a level of the (necessarily hierarchical) index labels, returning | ||
a DataFrame having a new level of column labels whose inner-most level | ||
|
@@ -3864,6 +3864,10 @@ def unstack(self, level=-1): | |
---------- | ||
level : int, string, or list of these, default -1 (last level) | ||
Level(s) of index to unstack, can pass level name | ||
fill_value : replace NaN with this value if the unstack produces | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add a versionadded tag |
||
missing values | ||
|
||
.. versionadded:: 0.18.0 | ||
|
||
See also | ||
-------- | ||
|
@@ -3905,7 +3909,7 @@ def unstack(self, level=-1): | |
unstacked : DataFrame or Series | ||
""" | ||
from pandas.core.reshape import unstack | ||
return unstack(self, level) | ||
return unstack(self, level, fill_value) | ||
|
||
# ---------------------------------------------------------------------- | ||
# Time series-related | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2003,7 +2003,7 @@ def reorder_levels(self, order): | |
result.index = result.index.reorder_levels(order) | ||
return result | ||
|
||
def unstack(self, level=-1): | ||
def unstack(self, level=-1, fill_value=None): | ||
""" | ||
Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. | ||
The level involved will automatically get sorted. | ||
|
@@ -2012,6 +2012,10 @@ def unstack(self, level=-1): | |
---------- | ||
level : int, string, or list of these, default last level | ||
Level(s) to unstack, can pass level name | ||
fill_value : replace NaN with this value if the unstack produces | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. add a versionadded tag |
||
missing values | ||
|
||
.. versionadded:: 0.18.0 | ||
|
||
Examples | ||
-------- | ||
|
@@ -2036,7 +2040,7 @@ def unstack(self, level=-1): | |
unstacked : DataFrame | ||
""" | ||
from pandas.core.reshape import unstack | ||
return unstack(self, level) | ||
return unstack(self, level, fill_value) | ||
|
||
# ---------------------------------------------------------------------- | ||
# function application | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
import numpy as np | ||
|
||
from pandas.compat import u | ||
from pandas import DataFrame, Index, Series, MultiIndex, date_range | ||
from pandas import DataFrame, Index, Series, MultiIndex, date_range, Timedelta, Period | ||
import pandas as pd | ||
|
||
from pandas.util.testing import (assert_series_equal, | ||
|
@@ -136,6 +136,141 @@ def test_stack_unstack(self): | |
assert_frame_equal(unstacked_cols.T, self.frame) | ||
assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) | ||
|
||
def test_unstack_fill(self): | ||
|
||
# GH #9746: fill_value keyword argument for Series | ||
# and DataFrame unstack | ||
|
||
# From a series | ||
data = Series([1, 2, 4, 5], dtype=np.int16) | ||
data.index = MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
result = data.unstack(fill_value=-1) | ||
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]}, | ||
index=['x', 'y', 'z'], dtype=np.int16) | ||
assert_frame_equal(result, expected) | ||
|
||
# From a series with incorrect data type for fill_value | ||
result = data.unstack(fill_value=0.5) | ||
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]}, | ||
index=['x', 'y', 'z'], dtype=np.float) | ||
assert_frame_equal(result, expected) | ||
|
||
# From a dataframe | ||
rows = [[1, 2], [3, 4], [5, 6], [7, 8]] | ||
df = DataFrame(rows, columns=list('AB'), dtype=np.int32) | ||
df.index = MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
result = df.unstack(fill_value=-1) | ||
|
||
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]] | ||
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32) | ||
expected.columns = MultiIndex.from_tuples( | ||
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) | ||
assert_frame_equal(result, expected) | ||
|
||
# From a mixed type dataframe | ||
df['A'] = df['A'].astype(np.int16) | ||
df['B'] = df['B'].astype(np.float64) | ||
|
||
result = df.unstack(fill_value=-1) | ||
expected['A'] = expected['A'].astype(np.int16) | ||
expected['B'] = expected['B'].astype(np.float64) | ||
assert_frame_equal(result, expected) | ||
|
||
# From a dataframe with incorrect data type for fill_value | ||
result = df.unstack(fill_value=0.5) | ||
|
||
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]] | ||
expected = DataFrame(rows, index=list('xyz'), dtype=np.float) | ||
expected.columns = MultiIndex.from_tuples( | ||
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) | ||
assert_frame_equal(result, expected) | ||
|
||
# Test unstacking with date times | ||
dv = pd.date_range('2012-01-01', periods=4).values | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a test with timedeltas as well (similar to the datetimes test)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, timedeltas required some changes to _maybe_promote, good catch. As… (On Wed, Jan 20, 2016 at 8:48 AM, Jeff Reback notifications@github.com wrote:)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. gr8! also add a test with `Period` |
||
data = Series(dv) | ||
data.index = MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
result = data.unstack() | ||
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]], | ||
'b': [dv[1], dv[2], pd.NaT]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
result = data.unstack(fill_value=dv[0]) | ||
expected = DataFrame({'a': [dv[0], dv[0], dv[3]], | ||
'b': [dv[1], dv[2], dv[0]]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
# Test unstacking with time deltas | ||
td = [Timedelta(days=i) for i in range(4)] | ||
data = Series(td) | ||
data.index = MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
result = data.unstack() | ||
expected = DataFrame({'a': [td[0], pd.NaT, td[3]], | ||
'b': [td[1], td[2], pd.NaT]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
result = data.unstack(fill_value=td[1]) | ||
expected = DataFrame({'a': [td[0], td[1], td[3]], | ||
'b': [td[1], td[2], td[1]]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
# Test unstacking with period | ||
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), | ||
Period('2012-04')] | ||
data = Series(periods) | ||
data.index = MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
result = data.unstack() | ||
expected = DataFrame({'a': [periods[0], None, periods[3]], | ||
'b': [periods[1], periods[2], None]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
result = data.unstack(fill_value=periods[1]) | ||
expected = DataFrame({'a': [periods[0], periods[1], periods[3]], | ||
'b': [periods[1], periods[2], periods[1]]}, | ||
index=['x', 'y', 'z']) | ||
assert_frame_equal(result, expected) | ||
|
||
# Test unstacking with categorical | ||
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') | ||
data.index = pd.MultiIndex.from_tuples( | ||
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]) | ||
|
||
# By default missing values will be NaN | ||
result = data.unstack() | ||
expected = DataFrame({'a': pd.Categorical(list('axa'), | ||
categories=list('abc')), | ||
'b': pd.Categorical(list('bcx'), | ||
categories=list('abc'))}, | ||
index=list('xyz')) | ||
assert_frame_equal(result, expected) | ||
|
||
# Fill with non-category results in NaN entries similar to above | ||
result = data.unstack(fill_value='d') | ||
assert_frame_equal(result, expected) | ||
|
||
# Fill with category value replaces missing values as expected | ||
result = data.unstack(fill_value='c') | ||
expected = DataFrame({'a': pd.Categorical(list('aca'), | ||
categories=list('abc')), | ||
'b': pd.Categorical(list('bcc'), | ||
categories=list('abc'))}, | ||
index=list('xyz')) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_stack_ints(self): | ||
df = DataFrame( | ||
np.random.randn(30, 27), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a versionadded tag