From 9b827efb3fe9d696c2e79be2d11544a2be7181e8 Mon Sep 17 00:00:00 2001 From: Justin Solinsky Date: Tue, 24 Jan 2017 23:01:10 -0500 Subject: [PATCH 1/3] ENH union_categoricals supports ignore_order GH13410 --- pandas/tests/tools/test_concat.py | 33 +++++++++++++++++++++++++++++++ pandas/types/concat.py | 12 ++++++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 2a28fccdc9b94..c45afc19e65f6 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1662,6 +1662,39 @@ def test_union_categoricals_ordered(self): with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) + def test_union_categoricals_ignore_order(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([c1, c1], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=False) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=False) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True) + exp = Categorical([1, 2, 3, 1, 2, 3]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + c2 = Categorical([1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2], ignore_order=True, sort_categories=True) + exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) + tm.assert_categorical_equal(res, exp) + def 
test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 827eb160c452d..e9ee331c200df 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -208,7 +208,7 @@ def _concat_asobject(to_concat): return _concat_asobject(to_concat) -def union_categoricals(to_union, sort_categories=False): +def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. @@ -222,6 +222,9 @@ def union_categoricals(to_union, sort_categories=False): sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. + ignore_order: boolean, default False + If true, ordered categories will be ignored. Results in + an unordered categorical. Returns ------- @@ -264,7 +267,7 @@ def _maybe_unwrap(x): ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) - if sort_categories and ordered: + if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") @@ -272,7 +275,7 @@ def _maybe_unwrap(x): categories = categories.sort_values() indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) - elif all(not c.ordered for c in to_union): + elif ignore_order | all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) @@ -297,6 +300,9 @@ def _maybe_unwrap(x): else: raise TypeError('Categorical.ordered must be the same') + if ignore_order: + ordered = False + return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) From d278d623b7ed00317a870b3649c5692c284d2864 Mon Sep 17 00:00:00 2001 From: Justin Solinsky Date: Wed, 25 
Jan 2017 22:54:40 -0500 Subject: [PATCH 2/3] ENH union_categoricals supports ignore_order GH13410 --- doc/source/categorical.rst | 11 +++++++++++ doc/source/whatsnew/v0.20.0.txt | 1 + pandas/tests/tools/test_concat.py | 15 +++++++++------ pandas/types/concat.py | 8 ++++---- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 18e429cfc92fa..db974922e1d76 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -693,6 +693,17 @@ The below raises ``TypeError`` because the categories are ordered and not identi Out[3]: TypeError: to union ordered Categoricals, all categories must be the same +.. versionadded:: 0.20.0 + +Ordered categoricals with different categories or orderings can be combined by +using the ``ignore_order=True`` argument. + +.. ipython:: python + + a = pd.Categorical(["a", "b", "c"], ordered=True) + b = pd.Categorical(["c", "b", "a"], ordered=True) + union_categoricals([a, b], ignore_order=True) + ``union_categoricals`` also works with a ``CategoricalIndex``, or ``Series`` containing categorical data, but note that the resulting array will always be a plain ``Categorical`` diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ae4a3d3c3d97f..3f7f79490eca4 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -157,6 +157,7 @@ Other enhancements - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations +- ``ignore_ordered`` argument added to ``pd.types.concat.union_categoricals``; setting the argument to true will ignore the ordered attribute of unioned categoricals (:issue:`13410`) .. 
_whatsnew_0200.api_breaking: diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index c45afc19e65f6..1558aaf5814e7 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1671,14 +1671,14 @@ def test_union_categoricals_ignore_order(self): tm.assert_categorical_equal(res, exp) res = union_categoricals([c1, c1], ignore_order=True) - exp = Categorical([1, 2, 3, 1, 2, 3], ordered=False) + exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2], ignore_order=True) - exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=False) + exp = Categorical([1, 2, 3, np.nan, 3, 2]) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) @@ -1688,13 +1688,16 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - c1 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) - c2 = Categorical([1, 2, 3], ordered=True) - - res = union_categoricals([c1, c2], ignore_order=True, sort_categories=True) + res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([4, 5, 6], ordered=True) + result = union_categoricals([c1, c2], ignore_order=True) + expected = Categorical([1, 2, 3, 4, 5, 6]) + tm.assert_categorical_equal(result, expected) + def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index e9ee331c200df..48fc81ef61396 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -223,8 +223,8 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): If true, resulting 
categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False - If true, ordered categories will be ignored. Results in - an unordered categorical. + If true, the ordered attribute of the Categoricals will be ignored. + Results in an unordered categorical. Returns ------- @@ -238,7 +238,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError - Emmpty list of categoricals passed + Empty list of categoricals passed """ from pandas import Index, Categorical, CategoricalIndex, Series @@ -275,7 +275,7 @@ def _maybe_unwrap(x): categories = categories.sort_values() indexer = categories.get_indexer(first.categories) new_codes = take_1d(indexer, new_codes, fill_value=-1) - elif ignore_order | all(not c.ordered for c in to_union): + elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) From e9d00dec17bd6952048a46ae130ab8e79cba286b Mon Sep 17 00:00:00 2001 From: Justin Solinsky Date: Tue, 21 Feb 2017 22:01:49 -0500 Subject: [PATCH 3/3] GH15219 Documentation fixes based on feedback --- doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/tests/tools/test_concat.py | 20 +++++++++++++++++++- pandas/types/concat.py | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 3f7f79490eca4..004e58da1a3a0 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -157,7 +157,7 @@ Other enhancements - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations -- ``ignore_ordered`` argument added to ``pd.types.concat.union_categoricals``; setting the argument to true will ignore the ordered attribute of unioned categoricals (:issue:`13410`) +- ``ignore_order`` argument added to ``pd.types.concat.union_categoricals``; setting the argument to true will ignore the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information. .. _whatsnew_0200.api_breaking: diff --git a/pandas/tests/tools/test_concat.py b/pandas/tests/tools/test_concat.py index 1558aaf5814e7..6d40de465bff8 100644 --- a/pandas/tests/tools/test_concat.py +++ b/pandas/tests/tools/test_concat.py @@ -1663,6 +1663,7 @@ def test_union_categoricals_ordered(self): union_categoricals([c1, c2]) def test_union_categoricals_ignore_order(self): + # GH 15219 c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) @@ -1670,10 +1671,19 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + res = union_categoricals([c1, c1], ignore_order=True) exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) + res = union_categoricals([c1, c1], ignore_order=False) + exp = Categorical([1, 2, 3, 1, 2, 3], + categories=[1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) @@ -1688,7 +1698,8 @@ def test_union_categoricals_ignore_order(self): exp = Categorical([1, 2, 3, 1, 2, 3]) tm.assert_categorical_equal(res, exp) - res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True) + res = union_categoricals([c2, c1], ignore_order=True, + sort_categories=True) exp = 
Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(res, exp) @@ -1698,6 +1709,13 @@ def test_union_categoricals_ignore_order(self): expected = Categorical([1, 2, 3, 4, 5, 6]) tm.assert_categorical_equal(result, expected) + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2], ignore_order=False) + + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + def test_union_categoricals_sort(self): # GH 13846 c1 = Categorical(['x', 'y', 'z']) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 48fc81ef61396..9e47a97dd621a 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -226,6 +226,8 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False): If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. + .. versionadded:: 0.20.0 + Returns ------- result : Categorical