Skip to content

Commit

Permalink
ENH add dropna argument to pivot_table
Browse files Browse the repository at this point in the history
  • Loading branch information
hayd committed Jul 6, 2013
1 parent 030f613 commit 2d63a71
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 6 deletions.
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pandas 0.12
- support python3 (via ``PyTables 3.0.0``) (:issue:`3750`)
- Add modulo operator to Series, DataFrame
- Add ``date`` method to DatetimeIndex
- Add ``dropna`` argument to ``pivot_table`` (:issue:`3820`)
- Simplified the API and added a describe method to Categorical
- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
to specify custom column names of the returned DataFrame (:issue:`3649`),
Expand Down
26 changes: 22 additions & 4 deletions pandas/tools/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from pandas.core.index import MultiIndex
from pandas.core.reshape import _unstack_multiple
from pandas.tools.merge import concat
from pandas.tools.util import cartesian_product
import pandas.core.common as com
import numpy as np


def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
fill_value=None, margins=False):
fill_value=None, margins=False, dropna=True):
"""
Create a spreadsheet-style pivot table as a DataFrame. The levels in the
pivot table will be stored in MultiIndex objects (hierarchical indexes) on
Expand All @@ -31,6 +32,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
Value to replace missing values with
margins : boolean, default False
Add all row / columns (e.g. for subtotal / grand totals)
dropna : boolean, default True
Do not include columns whose entries are all NaN
Examples
--------
Expand Down Expand Up @@ -105,6 +108,19 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
for i in range(len(rows), len(keys))]
table = agged.unstack(to_unstack)

if not dropna:
try:
m = MultiIndex.from_arrays(cartesian_product(table.index.levels))
table = table.reindex_axis(m, axis=0)
except AttributeError:
pass # it's a single level

try:
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
table = table.reindex_axis(m, axis=1)
except AttributeError:
pass # it's a single level or a series

if isinstance(table, DataFrame):
if isinstance(table.columns, MultiIndex):
table = table.sortlevel(axis=1)
Expand Down Expand Up @@ -216,7 +232,7 @@ def _convert_by(by):


def crosstab(rows, cols, values=None, rownames=None, colnames=None,
aggfunc=None, margins=False):
aggfunc=None, margins=False, dropna=True):
"""
Compute a simple cross-tabulation of two (or more) factors. By default
computes a frequency table of the factors unless an array of values and an
Expand All @@ -238,6 +254,8 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
If passed, must match number of column arrays passed
margins : boolean, default False
Add row/column margins (subtotals)
dropna : boolean, default True
Do not include columns whose entries are all NaN
Notes
-----
Expand Down Expand Up @@ -281,13 +299,13 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
df = DataFrame(data)
df['__dummy__'] = 0
table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
aggfunc=len, margins=margins)
aggfunc=len, margins=margins, dropna=dropna)
return table.fillna(0).astype(np.int64)
else:
data['__dummy__'] = values
df = DataFrame(data)
table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
aggfunc=aggfunc, margins=margins)
aggfunc=aggfunc, margins=margins, dropna=dropna)
return table


Expand Down
29 changes: 28 additions & 1 deletion pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest

import numpy as np
from numpy.testing import assert_equal

from pandas import DataFrame, Series, Index
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.tools.merge import concat
from pandas.tools.pivot import pivot_table, crosstab
import pandas.util.testing as tm
Expand Down Expand Up @@ -62,6 +63,22 @@ def test_pivot_table_nocols(self):
xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
tm.assert_frame_equal(rs, xp)

def test_pivot_table_dropna(self):
    # With dropna=False the two-level axis is expected to contain every
    # (customer, product) combination, including pairs that never occur
    # together in the input rows.
    records = {
        'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
    }
    frame = DataFrame(records)

    # Same pivot twice: once with the pair on the columns, once on the rows.
    by_columns = frame.pivot_table('quantity', rows='month',
                                   cols=['customer', 'product'],
                                   dropna=False)
    by_index = frame.pivot_table('quantity', rows=['customer', 'product'],
                                 cols='month', dropna=False)

    customers = [u'A', u'B', u'C']
    products = [u'a', u'b', u'c', u'd']
    expected = MultiIndex.from_tuples(
        [(cust, prod) for cust in customers for prod in products])

    assert_equal(by_columns.columns.values, expected.values)
    assert_equal(by_index.index.values, expected.values)


def test_pass_array(self):
result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
Expand Down Expand Up @@ -374,6 +391,16 @@ def test_crosstab_pass_values(self):
aggfunc=np.sum)
tm.assert_frame_equal(table, expected)

def test_crosstab_dropna(self):
    # GH 3820
    # With dropna=False the column axis is expected to hold all four
    # (b, c) combinations, although ('one', 'shiny') never occurs.
    row_factor = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'],
                          dtype=object)
    col_outer = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'],
                         dtype=object)
    col_inner = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny',
                          'shiny'], dtype=object)

    result = crosstab(row_factor, [col_outer, col_inner],
                      rownames=['a'], colnames=['b', 'c'], dropna=False)

    expected = MultiIndex.from_tuples(
        [(outer, inner)
         for outer in ('one', 'two')
         for inner in ('dull', 'shiny')])
    assert_equal(result.columns.values, expected.values)

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
21 changes: 21 additions & 0 deletions pandas/tools/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os
import nose
import unittest

import numpy as np
from numpy.testing import assert_equal

from pandas.tools.util import cartesian_product

class TestCartesianProduct(unittest.TestCase):
    """Checks for ``cartesian_product``."""

    def test_simple(self):
        # Two factors: the first varies slowest, the second fastest.
        letters = list('ABC')
        numbers = [1, 22]
        expected = [
            np.array(['A', 'A', 'B', 'B', 'C', 'C']),
            np.array([ 1, 22, 1, 22, 1, 22]),
        ]
        result = cartesian_product([letters, numbers])
        assert_equal(result, expected)

# When this module is executed directly, hand it to nose's runmodule with the
# argv flags below; exit=False is passed so the call is asked not to exit.
if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
24 changes: 23 additions & 1 deletion pandas/tools/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,28 @@
from pandas.core.index import Index
import numpy as np

def match(needles, haystack):
    '''
    Locate each element of ``needles`` within ``haystack``.

    Both arguments are wrapped in an ``Index`` and the lookup is delegated
    to ``Index.get_indexer``.

    Parameters
    ----------
    needles : list-like
        Values to look up.
    haystack : list-like
        Values to search in.

    Returns
    -------
    The result of ``Index(haystack).get_indexer(Index(needles))``: per the
    pandas documentation, an integer array of positions in ``haystack``,
    with -1 marking values that are not present.
    '''
    haystack = Index(haystack)
    needles = Index(needles)
    # Single return: the duplicated second ``return`` was unreachable.
    return haystack.get_indexer(needles)

def cartesian_product(X):
    '''
    Numpy version of itertools.product or pandas.util.compat.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : sequence of list-likes
        The factors to combine. Each element must support ``len``.

    Returns
    -------
    list of ndarray
        One array per factor, each of length ``prod(len(x) for x in X)``.
        Taking the i-th entry of every array gives the i-th tuple of the
        product, with the first factor varying slowest and the last
        varying fastest. An empty ``X`` gives an empty list; if any
        factor is empty, every returned array is empty.

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='<U1'),
     array([1, 2, 1, 2, 1, 2])]
    '''
    X = list(X)
    if not X:
        # np.cumprod of an empty list would yield a float array; there is
        # nothing to combine anyway.
        return []

    # A real list (not a lazy ``map`` object) so numpy can consume it on
    # Python 3 and so it can be reversed below.
    lengths = [len(x) for x in X]

    # outer[i] = product of the lengths *before* factor i -> how many times
    # the repeated block of factor i has to be tiled.
    outer = np.insert(np.cumprod(lengths), 0, 1)
    # inner[i + 1] = product of the lengths *after* factor i -> how many
    # times each element of factor i is repeated. Built from a reversed
    # cumulative product rather than by division, so a zero-length factor
    # cannot cause a divide-by-zero and the counts stay integral.
    inner = np.append(np.cumprod(lengths[::-1])[::-1], 1)

    return [np.tile(np.repeat(x, int(inner[i + 1])), int(outer[i]))
            for i, x in enumerate(X)]

0 comments on commit 2d63a71

Please sign in to comment.