From 725729ff943b073fb31e1999d2d4910c0b3d1676 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 10 Apr 2015 16:28:50 +0200 Subject: [PATCH] Fix: unequal comparisons of categorical and scalar Before, unequal comparisons were not checking the order of the categories. This was due to a conversion to an ndarray, which turned the comparison into one between an ndarray and a scalar, which of course has no categories to take into account. Also add test cases and remove the one which actually tested the wrong behaviour. --- doc/source/whatsnew/v0.16.1.txt | 4 ++++ pandas/core/ops.py | 28 +++++++++++++++---------- pandas/tests/test_categorical.py | 35 +++++++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index ab57f1fb6ea10..2cba928be5ab7 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -116,3 +116,7 @@ Bug Fixes - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`) - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) + +- Bug in unequal comparisons between a ``Series`` of dtype ``"category"`` and a scalar (e.g. + ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``), which + wouldn't use the order of the categories but used the lexicographical order instead. 
(:issue:`9848`) \ No newline at end of file diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 954d2c8a77326..2af9cd43faaef 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -594,20 +594,26 @@ def wrapper(self, other): mask = isnull(self) - values = self.get_values() - other = _index.convert_scalar(values,_values_from_object(other)) + if com.is_categorical_dtype(self): + # cats are a special case as get_values() would return an ndarray, which would then + # not take categories ordering into account + # we can go directly to op, as the na_op would just test again and dispatch to it. + res = op(self.values, other) + else: + values = self.get_values() + other = _index.convert_scalar(values,_values_from_object(other)) - if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): - values = values.view('i8') + if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): + values = values.view('i8') - # scalars - res = na_op(values, other) - if np.isscalar(res): - raise TypeError('Could not compare %s type with Series' - % type(other)) + # scalars + res = na_op(values, other) + if np.isscalar(res): + raise TypeError('Could not compare %s type with Series' + % type(other)) - # always return a full value series here - res = _values_from_object(res) + # always return a full value series here + res = _values_from_object(res) res = pd.Series(res, index=self.index, name=self.name, dtype='bool') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7f4b3fcb94dfa..4c5678bf6633f 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -114,6 +114,9 @@ def f(): Categorical([1,2], [1,2,np.nan, np.nan]) self.assertRaises(ValueError, f) + # The default should be unordered + c1 = Categorical(["a", "b", "c", "a"]) + self.assertFalse(c1.ordered) # Categorical as input c1 = Categorical(["a", "b", "c", "a"]) @@ -367,6 +370,13 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) 
self.assertRaises(TypeError, lambda: a < cat_rev) + # Make sure that unequal comparisons take the category order into account + cat_rev = pd.Categorical(list("abc"), categories=list("cba"), ordered=True) + exp = np.array([True, False, False]) + res = cat_rev > "b" + self.assert_numpy_array_equal(res, exp) + + def test_na_flags_int_categories(self): # #1457 @@ -2390,6 +2400,18 @@ def test_comparisons(self): exp = Series([False, False, True]) tm.assert_series_equal(res, exp) + scalar = base[1] + res = cat > scalar + exp = Series([False, False, True]) + exp2 = cat.values > scalar + tm.assert_series_equal(res, exp) + tm.assert_numpy_array_equal(res.values, exp2) + res_rev = cat_rev > scalar + exp_rev = Series([True, False, False]) + exp_rev2 = cat_rev.values > scalar + tm.assert_series_equal(res_rev, exp_rev) + tm.assert_numpy_array_equal(res_rev.values, exp_rev2) + # Only categories with same categories can be compared def f(): cat > cat_rev @@ -2408,9 +2430,16 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) self.assertRaises(TypeError, lambda: a < cat_rev) - # Categoricals can be compared to scalar values - res = cat_rev > base[0] - tm.assert_series_equal(res, exp) + # unequal comparisons should raise for unordered cats + cat = Series(Categorical(list("abc"))) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + cat = Series(Categorical(list("abc"), ordered=False)) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + # And test NaN handling... cat = Series(Categorical(["a","b","c", np.nan]))