diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 40ae38f12fccb..31ba2e1042547 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -520,6 +520,8 @@ Bug Fixes - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) +- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) +- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 34ab3ae6863b5..0af7b6d80ce0e 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -27,7 +27,8 @@ is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning, ABCSeries) + is_bool_dtype, PerformanceWarning, + ABCSeries, ABCIndex) # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -664,6 +665,22 @@ def wrapper(left, right, name=name, na_op=na_op): return wrapper +def _comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = lib.list_to_object_array(y) + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = lib.vec_compare(x, y, op) + else: + result = lib.scalar_compare(x, y, op) + return result + + def _comp_method_SERIES(op, name, str_rep, masker=False): """ Wrapper function for Series arithmetic operations, to avoid @@ -680,16 +697,7 @@ def na_op(x, y): return op(y, x) if is_object_dtype(x.dtype): - if isinstance(y, list): - y = lib.list_to_object_array(y) - - if isinstance(y, (np.ndarray, ABCSeries)): - if not is_object_dtype(y.dtype): - result = lib.vec_compare(x, y.astype(np.object_), op) - else: - result = lib.vec_compare(x, y, op) - else: - result = lib.scalar_compare(x, y, op) + result = _comp_method_OBJECT_ARRAY(op, x, y) else: # we want to compare like types @@ -713,12 +721,11 @@ def na_op(x, y): (not isscalar(y) and needs_i8_conversion(y))): if isscalar(y): + mask = isnull(x) y = _index.convert_scalar(x, _values_from_object(y)) else: + mask = isnull(x) | isnull(y) y = y.view('i8') - - mask = isnull(x) - x = x.view('i8') try: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index ad27010714f63..e697dc63c2cdb 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -31,6 +31,7 @@ is_list_like, is_bool_dtype, is_integer_dtype, is_float_dtype, needs_i8_conversion) +from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin from pandas.core.config import get_option @@ -3182,8 +3183,11 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - func = getattr(self.values, op) - result = func(np.asarray(other)) + if is_object_dtype(self) and self.nlevels == 1: + # don't pass MultiIndex + result = _comp_method_OBJECT_ARRAY(op, self.values, other) + else: + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly @@ -3196,12 +3200,12 @@ def _evaluate_compare(self, other): return _evaluate_compare - cls.__eq__ = _make_compare('__eq__') - cls.__ne__ = _make_compare('__ne__') - cls.__lt__ = _make_compare('__lt__') - cls.__gt__ = _make_compare('__gt__') - cls.__le__ = _make_compare('__le__') - cls.__ge__ = _make_compare('__ge__') + cls.__eq__ = _make_compare(operator.eq) + cls.__ne__ = _make_compare(operator.ne) + cls.__lt__ = _make_compare(operator.lt) + cls.__gt__ = _make_compare(operator.gt) + cls.__le__ = _make_compare(operator.le) + cls.__ge__ = _make_compare(operator.ge) @classmethod def _add_numericlike_set_methods_disabled(cls): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index a9c7f93097f1b..7cbb502315b64 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -768,12 +768,12 @@ def scalar_compare(ndarray[object] values, object val, object op): raise ValueError('Unrecognized operator') result = np.empty(n, dtype=bool).view(np.uint8) - isnull_val = _checknull(val) + isnull_val = checknull(val) if flag == cpython.Py_NE: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = True elif isnull_val: result[i] = True @@ -785,7 +785,7 @@ def scalar_compare(ndarray[object] values, object val, object op): elif flag == cpython.Py_EQ: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -798,7 +798,7 @@ def scalar_compare(ndarray[object] values, object val, object op): else: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -864,7 +864,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = True else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) @@ -873,7 +873,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = False else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 6ab382beb7973..9c401e9ce6da8 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -980,24 +980,97 @@ def test_comparison_invalid(self): self.assertRaises(TypeError, lambda: x <= y) def test_more_na_comparisons(self): - left = Series(['a', np.nan, 'c']) - right = Series(['a', np.nan, 'd']) + for dtype in [None, object]: + left = Series(['a', np.nan, 'c'], dtype=dtype) + right = Series(['a', np.nan, 'd'], dtype=dtype) - result = left == right - expected = Series([True, False, False]) - assert_series_equal(result, expected) + result = left == right + expected = Series([True, False, False]) + assert_series_equal(result, expected) - result = left != right - expected = Series([False, True, True]) - assert_series_equal(result, expected) + result = left != right + expected = Series([False, True, True]) + assert_series_equal(result, expected) - result = left == np.nan - expected = Series([False, False, False]) - assert_series_equal(result, expected) + result = left == np.nan + expected = Series([False, False, False]) + assert_series_equal(result, expected) - result = left != np.nan - expected = Series([True, True, True]) - assert_series_equal(result, expected) + result = left != np.nan + expected = Series([True, True, True]) + assert_series_equal(result, expected) + + def test_nat_comparisons(self): + data = [([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + [pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]), + + ([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')], + [pd.NaT, pd.NaT, pd.Timedelta('3 days')]), + + ([pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')], + [pd.NaT, pd.NaT, pd.Period('2011-03', freq='M')])] + + # add lhs / rhs switched data + data = data + [(r, l) for l, r in data] + + for l, r in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + # Series, Index + for right in [Series(r, dtype=dtype), Index(r, dtype=dtype)]: + expected = Series([False, False, True]) + assert_series_equal(left == right, expected) + + expected = Series([True, True, False]) + assert_series_equal(left != right, expected) + + expected = Series([False, False, False]) + assert_series_equal(left < right, expected) + + expected = Series([False, False, False]) + assert_series_equal(left > right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left >= right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left <= right, expected) + + def test_nat_comparisons_scalar(self): + data = [[pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + + [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')], + + [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')]] + + for l in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + expected = Series([False, False, False]) + assert_series_equal(left == pd.NaT, expected) + assert_series_equal(pd.NaT == left, expected) + + expected = Series([True, True, True]) + assert_series_equal(left != pd.NaT, expected) + assert_series_equal(pd.NaT != left, expected) + + expected = Series([False, False, False]) + assert_series_equal(left < pd.NaT, expected) + assert_series_equal(pd.NaT > left, expected) + assert_series_equal(left <= pd.NaT, expected) + assert_series_equal(pd.NaT >= left, expected) + + assert_series_equal(left > pd.NaT, expected) + assert_series_equal(pd.NaT < left, expected) + assert_series_equal(left >= pd.NaT, expected) + assert_series_equal(pd.NaT <= left, expected) def test_comparison_different_length(self): a = Series(['a', 'b', 'c']) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 2e3d1ace9734c..4bafac873ea09 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -142,7 +142,7 @@ def _evaluate_compare(self, other, op): other = type(self)(other) # compare - result = getattr(self.asi8, op)(other.asi8) + result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 84f357481a28e..af4c46e2d16fa 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -36,7 +36,7 @@ def _td_index_cmp(opname, nat_result=False): def wrapper(self, other): func = getattr(super(TimedeltaIndex, self), opname) - if _is_convertible_to_td(other): + if _is_convertible_to_td(other) or other is tslib.NaT: other = _to_m8(other) result = func(other) if com.isnull(other): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 360944e355b4d..a03fafdf36adc 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -457,6 +457,32 @@ def test_sub_period(self): with tm.assertRaises(TypeError): p - idx + def test_comp_nat(self): + left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')]) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: @@ -1237,6 +1263,32 @@ def test_addition_ops(self): expected = Timestamp('20130102') self.assertEqual(result, expected) + def test_comp_nat(self): + left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')]) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 @@ -2038,6 +2090,32 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_comp_nat(self): + left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, + pd.Period('2011-01-03')]) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10)