ENH add dropna argument to pivot_table

hayd · hayd · commit 2d63a71d1526 · 2013-07-06T20:59:32.000+01:00
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -72,6 +72,7 @@ pandas 0.12
     - support python3 (via ``PyTables 3.0.0``) (:issue:`3750`)
   - Add modulo operator to Series, DataFrame
   - Add ``date`` method to DatetimeIndex
+  - Add ``dropna`` argument to pivot_table (:issue:`3820`)
   - Simplified the API and added a describe method to Categorical
   - ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
     to specify custom column names of the returned DataFrame (:issue:`3649`),
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -4,12 +4,13 @@
 from pandas.core.index import MultiIndex
 from pandas.core.reshape import _unstack_multiple
 from pandas.tools.merge import concat
+from pandas.tools.util import cartesian_product
 import pandas.core.common as com
 import numpy as np
 
 
 def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
-                fill_value=None, margins=False):
+                fill_value=None, margins=False, dropna=True):
     """
     Create a spreadsheet-style pivot table as a DataFrame. The levels in the
     pivot table will be stored in MultiIndex objects (hierarchical indexes) on
@@ -31,6 +32,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
         Value to replace missing values with
     margins : boolean, default False
         Add all row / columns (e.g. for subtotal / grand totals)
+    dropna : boolean, default True
+        Do not include columns whose entries are all NaN
 
     Examples
     --------
@@ -105,6 +108,19 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
                       for i in range(len(rows), len(keys))]
         table = agged.unstack(to_unstack)
 
+    if not dropna:
+        try:
+            m = MultiIndex.from_arrays(cartesian_product(table.index.levels))
+            table = table.reindex_axis(m, axis=0)
+        except AttributeError:
+            pass # it's a single level
+
+        try:
+            m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
+            table = table.reindex_axis(m, axis=1)
+        except AttributeError:
+            pass # it's a single level or a series
+
     if isinstance(table, DataFrame):
         if isinstance(table.columns, MultiIndex):
             table = table.sortlevel(axis=1)
@@ -216,7 +232,7 @@ def _convert_by(by):
 
 
 def crosstab(rows, cols, values=None, rownames=None, colnames=None,
-             aggfunc=None, margins=False):
+             aggfunc=None, margins=False, dropna=True):
     """
     Compute a simple cross-tabulation of two (or more) factors. By default
     computes a frequency table of the factors unless an array of values and an
@@ -238,6 +254,8 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
         If passed, must match number of column arrays passed
     margins : boolean, default False
         Add row/column margins (subtotals)
+    dropna : boolean, default True
+        Do not include columns whose entries are all NaN
 
     Notes
     -----
@@ -281,13 +299,13 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
         df = DataFrame(data)
         df['__dummy__'] = 0
         table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
-                               aggfunc=len, margins=margins)
+                               aggfunc=len, margins=margins, dropna=dropna)
         return table.fillna(0).astype(np.int64)
     else:
         data['__dummy__'] = values
         df = DataFrame(data)
         table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
-                               aggfunc=aggfunc, margins=margins)
+                               aggfunc=aggfunc, margins=margins, dropna=dropna)
         return table
 
 
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -1,8 +1,9 @@
 import unittest
 
 import numpy as np
+from numpy.testing import assert_equal
 
-from pandas import DataFrame, Series, Index
+from pandas import DataFrame, Series, Index, MultiIndex
 from pandas.tools.merge import concat
 from pandas.tools.pivot import pivot_table, crosstab
 import pandas.util.testing as tm
@@ -62,6 +63,22 @@ def test_pivot_table_nocols(self):
         xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
         tm.assert_frame_equal(rs, xp)
 
+    def test_pivot_table_dropna(self):
+        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
+                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
+                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
+                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
+                        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
+        pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
+        pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)
+
+        m = MultiIndex.from_tuples([(u'A', u'a'), (u'A', u'b'), (u'A', u'c'), (u'A', u'd'), 
+                                   (u'B', u'a'), (u'B', u'b'), (u'B', u'c'), (u'B', u'd'),
+                                   (u'C', u'a'), (u'C', u'b'), (u'C', u'c'), (u'C', u'd')])
+
+        assert_equal(pv_col.columns.values, m.values)
+        assert_equal(pv_ind.index.values, m.values)
+
 
     def test_pass_array(self):
         result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
@@ -374,6 +391,16 @@ def test_crosstab_pass_values(self):
                                   aggfunc=np.sum)
         tm.assert_frame_equal(table, expected)
 
+    def test_crosstab_dropna(self):
+        # GH 3820
+        a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object)
+        b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object)
+        c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object)
+        res = crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False)
+        m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'),
+                                    ('two', 'dull'), ('two', 'shiny')])
+        assert_equal(res.columns.values, m.values)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
@@ -0,0 +1,21 @@
+import os
+import nose
+import unittest
+
+import numpy as np
+from numpy.testing import assert_equal
+
+from pandas.tools.util import cartesian_product
+
+class TestCartesianProduct(unittest.TestCase):
+
+    def test_simple(self):
+        x, y = list('ABC'), [1, 22]
+        result = cartesian_product([x, y])
+        expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']),
+                    np.array([ 1, 22,  1, 22,  1, 22])]
+        assert_equal(result, expected)
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
@@ -1,6 +1,28 @@
 from pandas.core.index import Index
+import numpy as np
 
 def match(needles, haystack):
     haystack = Index(haystack)
     needles = Index(needles)
-    return haystack.get_indexer(needles)
+    return haystack.get_indexer(needles)
+
+def cartesian_product(X):
+    '''
+    Numpy version of itertools.product or pandas.util.compat.product.
+    Sometimes faster (for large inputs)...
+
+    Examples
+    --------
+    >>> cartesian_product([list('ABC'), [1, 2]])
+    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
+ 	array([1, 2, 1, 2, 1, 2])]
+
+    '''
+    lenX = map(len, X)
+    cumprodX = np.cumproduct(lenX)
+    a = np.insert(cumprodX, 0, 1)
+    b = a[-1] / a[1:]
+    return [np.tile(np.repeat(x, b[i]), 
+    	            np.product(a[i]))
+               for i, x in enumerate(X)]
+