Merge pull request #3600 from jreback/modulo

jreback · jreback · commit a14cbd0ec2ce · 2013-05-14T14:48:10.000-07:00
BUG:  Fix integer modulo and division to make integer and float dtypes work similarly for invalid values
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -48,6 +48,7 @@ pandas 0.11.1
       to append an index with a different name than the existing
     - support datelike columns with a timezone as data_columns (GH2852_)
     - table writing performance improvements.
+  - Add modulo operator to Series, DataFrame
 
 **API Changes**
 
@@ -111,6 +112,8 @@ pandas 0.11.1
     is a ``list`` or ``tuple``.
   - Fixed bug where a time-series was being selected in preference to an actual column name
     in a frame (GH3594_)
+  - Fix modulo and integer division on Series and DataFrames to act similarly to ``float`` dtypes and return
+    ``np.nan`` or ``np.inf`` as appropriate (GH3590_)
 
 .. _GH3164: https://github.com/pydata/pandas/issues/3164
 .. _GH2786: https://github.com/pydata/pandas/issues/2786
@@ -155,6 +158,7 @@ pandas 0.11.1
 .. _GH3593: https://github.com/pydata/pandas/issues/3593
 .. _GH3556: https://github.com/pydata/pandas/issues/3556
 .. _GH3594: https://github.com/pydata/pandas/issues/3594
+.. _GH3590: https://github.com/pydata/pandas/issues/3590
 .. _GH3435: https://github.com/pydata/pandas/issues/3435
 
 
diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -9,6 +9,17 @@ enhancements along with a large number of bug fixes.
 API changes
 ~~~~~~~~~~~
 
+  - Fix modulo and integer division on Series and DataFrames to act similarly to ``float`` dtypes and return
+    ``np.nan`` or ``np.inf`` as appropriate (GH3590_). This corrects a numpy bug that treats ``integer``
+    and ``float`` dtypes differently.
+
+    .. ipython:: python
+
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] })
+        p % 0
+        p % p
+        p / p
+        p / 0
 
 Enhancements
 ~~~~~~~~~~~~
@@ -33,4 +44,5 @@ on GitHub for a complete list.
 .. _GH3477: https://github.com/pydata/pandas/issues/3477
 .. _GH3492: https://github.com/pydata/pandas/issues/3492
 .. _GH3499: https://github.com/pydata/pandas/issues/3499
+.. _GH3590: https://github.com/pydata/pandas/issues/3590
 .. _GH3435: https://github.com/pydata/pandas/issues/3435
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -793,13 +793,16 @@ def changeit():
 
             # try to directly set by expanding our array to full
             # length of the boolean
-            om = other[mask]
-            om_at = om.astype(result.dtype)
-            if (om == om_at).all():
-                new_other = result.values.copy()
-                new_other[mask] = om_at
-                result[:] = new_other
-                return result, False
+            try:
+                om = other[mask]
+                om_at = om.astype(result.dtype)
+                if (om == om_at).all():
+                    new_other = result.values.copy()
+                    new_other[mask] = om_at
+                    result[:] = new_other
+                    return result, False
+            except:
+                pass
 
             # we are forced to change the dtype of the result as the input isn't compatible
             r, fill_value = _maybe_upcast(result, fill_value=other, dtype=dtype, copy=True)
@@ -948,6 +951,27 @@ def _lcd_dtypes(a_dtype, b_dtype):
             return np.float64
     return np.object
 
+def _fill_zeros(result, y, fill):
+    """ Where ``y`` (a scalar or ndarray) is of integer dtype and equal
+        to 0, put ``fill`` into ``result`` at those positions; a ``fill``
+        of None skips the check.  Return the (possibly replaced) result """
+
+    if fill is not None:
+        if not isinstance(y, np.ndarray):
+            dtype, value = _infer_dtype_from_scalar(y)
+            y = pa.empty(result.shape,dtype=dtype)
+            y.fill(value)
+
+        if is_integer_dtype(y):
+            # flag the positions where the integer operand is exactly zero
+            mask = y.ravel() == 0
+            if mask.any():
+                shape = result.shape
+                result, changed = _maybe_upcast_putmask(result.ravel(),mask,fill)
+                result = result.reshape(shape)
+
+    return result
+
 def _interp_wrapper(f, wrap_dtype, na_override=None):
     def wrapper(arr, mask, limit=None):
         view = arr.view(wrap_dtype)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -189,10 +189,12 @@ class DataConflictError(Exception):
 # Factory helper methods
 
 
-def _arith_method(op, name, str_rep = None, default_axis='columns'):
+def _arith_method(op, name, str_rep = None, default_axis='columns', fill_zeros=None):
     def na_op(x, y):
         try:
             result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True)
+            result = com._fill_zeros(result,y,fill_zeros)
+
         except TypeError:
             xrav = x.ravel()
             result = np.empty(x.size, dtype=x.dtype)
@@ -841,20 +843,23 @@ def __contains__(self, key):
     __sub__ = _arith_method(operator.sub, '__sub__', '-', default_axis=None)
     __mul__ = _arith_method(operator.mul, '__mul__', '*', default_axis=None)
     __truediv__ = _arith_method(operator.truediv, '__truediv__', '/',
-                                default_axis=None)
+                                default_axis=None, fill_zeros=np.inf)
     __floordiv__ = _arith_method(operator.floordiv, '__floordiv__',
-                                 default_axis=None)
+                                 default_axis=None, fill_zeros=np.inf)
     __pow__ = _arith_method(operator.pow, '__pow__', '**', default_axis=None)
 
+    __mod__ = _arith_method(operator.mod, '__mod__', default_axis=None, fill_zeros=np.nan)
+
     __radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None)
     __rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None)
     __rsub__ = _arith_method(lambda x, y: y - x, '__rsub__', default_axis=None)
     __rtruediv__ = _arith_method(lambda x, y: y / x, '__rtruediv__',
-                                 default_axis=None)
+                                 default_axis=None, fill_zeros=np.inf)
     __rfloordiv__ = _arith_method(lambda x, y: y // x, '__rfloordiv__',
-                                  default_axis=None)
+                                  default_axis=None, fill_zeros=np.inf)
     __rpow__ = _arith_method(lambda x, y: y ** x, '__rpow__',
                              default_axis=None)
+    __rmod__ = _arith_method(lambda x, y: y % x, '__rmod__', default_axis=None, fill_zeros=np.nan)
 
     # boolean operators
     __and__ = _arith_method(operator.and_, '__and__', '&')
@@ -863,9 +868,10 @@ def __contains__(self, key):
 
     # Python 2 division methods
     if not py3compat.PY3:
-        __div__ = _arith_method(operator.div, '__div__', '/', default_axis=None)
+        __div__ = _arith_method(operator.div, '__div__', '/', 
+                                default_axis=None, fill_zeros=np.inf)
         __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__',
-                                 default_axis=None)
+                                 default_axis=None, fill_zeros=np.inf)
 
     def __neg__(self):
         arr = operator.neg(self.values)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -55,14 +55,17 @@
 # Wrapper function for Series arithmetic methods
 
 
-def _arith_method(op, name):
+def _arith_method(op, name, fill_zeros=None):
     """
     Wrapper function for Series arithmetic operations, to avoid
     code duplication.
     """
     def na_op(x, y):
         try:
+
             result = op(x, y)
+            result = com._fill_zeros(result,y,fill_zeros)
+
         except TypeError:
             result = pa.empty(len(x), dtype=x.dtype)
             if isinstance(y, pa.Array):
@@ -1258,16 +1261,18 @@ def iteritems(self):
     __add__ = _arith_method(operator.add, '__add__')
     __sub__ = _arith_method(operator.sub, '__sub__')
     __mul__ = _arith_method(operator.mul, '__mul__')
-    __truediv__ = _arith_method(operator.truediv, '__truediv__')
-    __floordiv__ = _arith_method(operator.floordiv, '__floordiv__')
+    __truediv__ = _arith_method(operator.truediv, '__truediv__', fill_zeros=np.inf)
+    __floordiv__ = _arith_method(operator.floordiv, '__floordiv__', fill_zeros=np.inf)
     __pow__ = _arith_method(operator.pow, '__pow__')
+    __mod__ = _arith_method(operator.mod, '__mod__', fill_zeros=np.nan)
 
     __radd__ = _arith_method(_radd_compat, '__add__')
     __rmul__ = _arith_method(operator.mul, '__mul__')
     __rsub__ = _arith_method(lambda x, y: y - x, '__sub__')
-    __rtruediv__ = _arith_method(lambda x, y: y / x, '__truediv__')
-    __rfloordiv__ = _arith_method(lambda x, y: y // x, '__floordiv__')
+    __rtruediv__ = _arith_method(lambda x, y: y / x, '__truediv__', fill_zeros=np.inf)
+    __rfloordiv__ = _arith_method(lambda x, y: y // x, '__floordiv__', fill_zeros=np.inf)
     __rpow__ = _arith_method(lambda x, y: y ** x, '__pow__')
+    __rmod__ = _arith_method(lambda x, y: y % x, '__mod__', fill_zeros=np.nan)
 
     # comparisons
     __gt__ = _comp_method(operator.gt, '__gt__')
@@ -1301,8 +1306,8 @@ def __invert__(self):
 
     # Python 2 division operators
     if not py3compat.PY3:
-        __div__ = _arith_method(operator.div, '__div__')
-        __rdiv__ = _arith_method(lambda x, y: y / x, '__div__')
+        __div__ = _arith_method(operator.div, '__div__', fill_zeros=np.inf)
+        __rdiv__ = _arith_method(lambda x, y: y / x, '__div__', fill_zeros=np.inf)
         __idiv__ = __div__
 
     #----------------------------------------------------------------------
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -4011,6 +4011,50 @@ def test_operators_none_as_na(self):
             result = op(df.fillna(7), df)
             assert_frame_equal(result, expected)
 
+    def test_modulo(self):
+
+        # GH3590: modulo on integer columns, including zero divisors
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] })
+
+        ### this is technically wrong as the integer portion is coerced to float ###
+        expected = DataFrame({ 'first' : Series([0,0,0,0],dtype='float64'), 'second' : Series([np.nan,np.nan,np.nan,0]) })
+        result = p % p
+        assert_frame_equal(result,expected)
+
+        # numpy treats the zero divisors differently, so NaN is patched in before comparing
+        result2 = DataFrame(p.values % p.values,index=p.index,columns=p.columns,dtype='float64')
+        result2.iloc[0:3,1] = np.nan
+        assert_frame_equal(result2,expected)
+
+        result = p % 0
+        expected = DataFrame(np.nan,index=p.index,columns=p.columns)
+        assert_frame_equal(result,expected)
+
+        # on float64 values numpy gives the same all-NaN answer without patching
+        result2 = DataFrame(p.values.astype('float64') % 0,index=p.index,columns=p.columns)
+        assert_frame_equal(result2,expected)
+
+    def test_div(self):
+
+        # integer division where the divisor contains zeros
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] })
+        result = p / p
+
+        ### this is technically wrong as the integer portion is coerced to float ###
+        expected = DataFrame({ 'first' : Series([1,1,1,1],dtype='float64'), 'second' : Series([np.inf,np.inf,np.inf,1]) })
+        assert_frame_equal(result,expected)
+        # float64 reference: the 0/0 entries come back as NaN, hence the fillna
+        result2 = DataFrame(p.values.astype('float64')/p.values,index=p.index,columns=p.columns).fillna(np.inf)
+        assert_frame_equal(result2,expected)
+
+        result = p / 0
+        expected = DataFrame(np.inf,index=p.index,columns=p.columns)
+        assert_frame_equal(result,expected)
+
+        # float64 reference again; fillna covers the 0/0 entries
+        result2 = DataFrame(p.values.astype('float64')/0,index=p.index,columns=p.columns).fillna(np.inf)
+        assert_frame_equal(result2,expected)
+
     def test_logical_operators(self):
         import operator
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1766,6 +1766,49 @@ def test_neg(self):
     def test_invert(self):
         assert_series_equal(-(self.series < 0), ~(self.series < 0))
 
+    def test_modulo(self):
+
+        # GH3590: integer modulo with zero divisors must give NaN
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] })
+        result = p['first'] % p['second']
+        expected = Series(p['first'].values % p['second'].values,dtype='float64')
+        expected.iloc[0:3] = np.nan
+        assert_series_equal(result,expected)
+
+        result = p['first'] % 0
+        expected = Series(np.nan,index=p.index)
+        assert_series_equal(result,expected)
+
+        p = p.astype('float64')
+        result = p['first'] % p['second']
+        expected = Series(p['first'].values % p['second'].values)
+        assert_series_equal(result,expected)
+
+    def test_div(self):
+
+        # integer division where the divisor contains zeros: expect inf
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [0,0,0,3] })
+        result = p['first'] / p['second']
+        expected = Series(p['first'].values / p['second'].values,dtype='float64')
+        expected.iloc[0:3] = np.inf
+        assert_series_equal(result,expected)
+
+        result = p['first'] / 0
+        expected = Series(np.inf,index=p.index)
+        assert_series_equal(result,expected)
+
+        p = p.astype('float64')
+        result = p['first'] / p['second']
+        expected = Series(p['first'].values / p['second'].values)
+        assert_series_equal(result,expected)
+
+        p = DataFrame({ 'first' : [3,4,5,8], 'second' : [1,1,1,1] })
+        result = p['first'] / p['second']
+        if py3compat.PY3:
+            assert_series_equal(result,p['first'].astype('float64'))
+        else:
+            assert_series_equal(result,p['first'])
+
     def test_operators(self):
 
         def _check_op(series, other, op, pos_only=False):