Skip to content

Commit 8e58225

Browse files
authored
CLN: move safe_sort from core.algorithms to core.sorting (#17034)
COMPAT: safe_sort now coerces list-like input to object dtype rather than to a numpy string dtype (xref: #17003 (comment)).
1 parent 8d0c025 commit 8e58225

File tree

6 files changed: +210 -192 lines changed

pandas/core/algorithms.py

+1 -99
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@
3030
from pandas.core.dtypes.missing import isnull
3131

3232
from pandas.core import common as com
33-
from pandas.compat import string_types
3433
from pandas._libs import algos, lib, hashtable as htable
3534
from pandas._libs.tslib import iNaT
3635

@@ -431,104 +430,6 @@ def isin(comps, values):
431430
return f(comps, values)
432431

433432

434-
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
435-
"""
436-
Sort ``values`` and reorder corresponding ``labels``.
437-
``values`` should be unique if ``labels`` is not None.
438-
Safe for use with mixed types (int, str), orders ints before strs.
439-
440-
.. versionadded:: 0.19.0
441-
442-
Parameters
443-
----------
444-
values : list-like
445-
Sequence; must be unique if ``labels`` is not None.
446-
labels : list_like
447-
Indices to ``values``. All out of bound indices are treated as
448-
"not found" and will be masked with ``na_sentinel``.
449-
na_sentinel : int, default -1
450-
Value in ``labels`` to mark "not found".
451-
Ignored when ``labels`` is None.
452-
assume_unique : bool, default False
453-
When True, ``values`` are assumed to be unique, which can speed up
454-
the calculation. Ignored when ``labels`` is None.
455-
456-
Returns
457-
-------
458-
ordered : ndarray
459-
Sorted ``values``
460-
new_labels : ndarray
461-
Reordered ``labels``; returned when ``labels`` is not None.
462-
463-
Raises
464-
------
465-
TypeError
466-
* If ``values`` is not list-like or if ``labels`` is neither None
467-
nor list-like
468-
* If ``values`` cannot be sorted
469-
ValueError
470-
* If ``labels`` is not None and ``values`` contain duplicates.
471-
"""
472-
if not is_list_like(values):
473-
raise TypeError("Only list-like objects are allowed to be passed to"
474-
"safe_sort as values")
475-
values = np.asarray(values)
476-
477-
def sort_mixed(values):
478-
# order ints before strings, safe in py3
479-
str_pos = np.array([isinstance(x, string_types) for x in values],
480-
dtype=bool)
481-
nums = np.sort(values[~str_pos])
482-
strs = np.sort(values[str_pos])
483-
return _ensure_object(np.concatenate([nums, strs]))
484-
485-
sorter = None
486-
if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer':
487-
# unorderable in py3 if mixed str/int
488-
ordered = sort_mixed(values)
489-
else:
490-
try:
491-
sorter = values.argsort()
492-
ordered = values.take(sorter)
493-
except TypeError:
494-
# try this anyway
495-
ordered = sort_mixed(values)
496-
497-
# labels:
498-
499-
if labels is None:
500-
return ordered
501-
502-
if not is_list_like(labels):
503-
raise TypeError("Only list-like objects or None are allowed to be"
504-
"passed to safe_sort as labels")
505-
labels = _ensure_platform_int(np.asarray(labels))
506-
507-
from pandas import Index
508-
if not assume_unique and not Index(values).is_unique:
509-
raise ValueError("values should be unique if labels is not None")
510-
511-
if sorter is None:
512-
# mixed types
513-
(hash_klass, _), values = _get_data_algo(values, _hashtables)
514-
t = hash_klass(len(values))
515-
t.map_locations(values)
516-
sorter = _ensure_platform_int(t.lookup(ordered))
517-
518-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
519-
reverse_indexer.put(sorter, np.arange(len(sorter)))
520-
521-
mask = (labels < -len(values)) | (labels >= len(values)) | \
522-
(labels == na_sentinel)
523-
524-
# (Out of bound indices will be masked with `na_sentinel` next, so we may
525-
# deal with them here without performance loss using `mode='wrap'`.)
526-
new_labels = reverse_indexer.take(labels, mode='wrap')
527-
np.putmask(new_labels, mask, na_sentinel)
528-
529-
return ordered, _ensure_platform_int(new_labels)
530-
531-
532433
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
533434
"""
534435
Encode input values as an enumerated type or categorical variable
@@ -568,6 +469,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
568469
uniques = uniques.to_array()
569470

570471
if sort and len(uniques) > 0:
472+
from pandas.core.sorting import safe_sort
571473
uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
572474
assume_unique=True)
573475

pandas/core/indexes/base.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,7 @@
4848
import pandas.core.dtypes.concat as _concat
4949
import pandas.core.missing as missing
5050
import pandas.core.algorithms as algos
51+
import pandas.core.sorting as sorting
5152
from pandas.io.formats.printing import pprint_thing
5253
from pandas.core.ops import _comp_method_OBJECT_ARRAY
5354
from pandas.core.strings import StringAccessorMixin
@@ -2306,7 +2307,7 @@ def difference(self, other):
23062307
assume_unique=True)
23072308
the_diff = this.values.take(label_diff)
23082309
try:
2309-
the_diff = algos.safe_sort(the_diff)
2310+
the_diff = sorting.safe_sort(the_diff)
23102311
except TypeError:
23112312
pass
23122313

@@ -2366,7 +2367,7 @@ def symmetric_difference(self, other, result_name=None):
23662367

23672368
the_diff = _concat._concat_compat([left_diff, right_diff])
23682369
try:
2369-
the_diff = algos.safe_sort(the_diff)
2370+
the_diff = sorting.safe_sort(the_diff)
23702371
except TypeError:
23712372
pass
23722373

pandas/core/reshape/merge.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838

3939
from pandas.core.sorting import is_int64_overflow_possible
4040
import pandas.core.algorithms as algos
41+
import pandas.core.sorting as sorting
4142
import pandas.core.common as com
4243
from pandas._libs import hashtable as libhashtable, join as libjoin, lib
4344
from pandas.errors import MergeError
@@ -1491,7 +1492,7 @@ def _sort_labels(uniques, left, right):
14911492
l = len(left)
14921493
labels = np.concatenate([left, right])
14931494

1494-
_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
1495+
_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
14951496
new_labels = _ensure_int64(new_labels)
14961497
new_left, new_right = new_labels[:l], new_labels[l:]
14971498

pandas/core/sorting.py

+107 -1
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,14 @@
11
""" miscellaneous sorting / groupby utilities """
22

33
import numpy as np
4-
from pandas.compat import long
4+
from pandas.compat import long, string_types, PY3
55
from pandas.core.categorical import Categorical
66
from pandas.core.dtypes.common import (
77
_ensure_platform_int,
88
_ensure_int64,
9+
is_list_like,
910
is_categorical_dtype)
11+
from pandas.core.dtypes.cast import infer_dtype_from_array
1012
from pandas.core.dtypes.missing import isnull
1113
import pandas.core.algorithms as algorithms
1214
from pandas._libs import lib, algos, hashtable
@@ -376,3 +378,107 @@ def _reorder_by_uniques(uniques, labels):
376378
uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)
377379

378380
return uniques, labels
381+
382+
383+
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False):
384+
"""
385+
Sort ``values`` and reorder corresponding ``labels``.
386+
``values`` should be unique if ``labels`` is not None.
387+
Safe for use with mixed types (int, str), orders ints before strs.
388+
389+
.. versionadded:: 0.19.0
390+
391+
Parameters
392+
----------
393+
values : list-like
394+
Sequence; must be unique if ``labels`` is not None.
395+
labels : list_like
396+
Indices to ``values``. All out of bound indices are treated as
397+
"not found" and will be masked with ``na_sentinel``.
398+
na_sentinel : int, default -1
399+
Value in ``labels`` to mark "not found".
400+
Ignored when ``labels`` is None.
401+
assume_unique : bool, default False
402+
When True, ``values`` are assumed to be unique, which can speed up
403+
the calculation. Ignored when ``labels`` is None.
404+
405+
Returns
406+
-------
407+
ordered : ndarray
408+
Sorted ``values``
409+
new_labels : ndarray
410+
Reordered ``labels``; returned when ``labels`` is not None.
411+
412+
Raises
413+
------
414+
TypeError
415+
* If ``values`` is not list-like or if ``labels`` is neither None
416+
nor list-like
417+
* If ``values`` cannot be sorted
418+
ValueError
419+
* If ``labels`` is not None and ``values`` contain duplicates.
420+
"""
421+
if not is_list_like(values):
422+
raise TypeError("Only list-like objects are allowed to be passed to"
423+
"safe_sort as values")
424+
425+
if not isinstance(values, np.ndarray):
426+
427+
# don't convert to string types
428+
dtype, _ = infer_dtype_from_array(values)
429+
values = np.asarray(values, dtype=dtype)
430+
431+
def sort_mixed(values):
432+
# order ints before strings, safe in py3
433+
str_pos = np.array([isinstance(x, string_types) for x in values],
434+
dtype=bool)
435+
nums = np.sort(values[~str_pos])
436+
strs = np.sort(values[str_pos])
437+
return np.concatenate([nums, np.asarray(strs, dtype=object)])
438+
439+
sorter = None
440+
if PY3 and lib.infer_dtype(values) == 'mixed-integer':
441+
# unorderable in py3 if mixed str/int
442+
ordered = sort_mixed(values)
443+
else:
444+
try:
445+
sorter = values.argsort()
446+
ordered = values.take(sorter)
447+
except TypeError:
448+
# try this anyway
449+
ordered = sort_mixed(values)
450+
451+
# labels:
452+
453+
if labels is None:
454+
return ordered
455+
456+
if not is_list_like(labels):
457+
raise TypeError("Only list-like objects or None are allowed to be"
458+
"passed to safe_sort as labels")
459+
labels = _ensure_platform_int(np.asarray(labels))
460+
461+
from pandas import Index
462+
if not assume_unique and not Index(values).is_unique:
463+
raise ValueError("values should be unique if labels is not None")
464+
465+
if sorter is None:
466+
# mixed types
467+
(hash_klass, _), values = algorithms._get_data_algo(
468+
values, algorithms._hashtables)
469+
t = hash_klass(len(values))
470+
t.map_locations(values)
471+
sorter = _ensure_platform_int(t.lookup(ordered))
472+
473+
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
474+
reverse_indexer.put(sorter, np.arange(len(sorter)))
475+
476+
mask = (labels < -len(values)) | (labels >= len(values)) | \
477+
(labels == na_sentinel)
478+
479+
# (Out of bound indices will be masked with `na_sentinel` next, so we may
480+
# deal with them here without performance loss using `mode='wrap'`.)
481+
new_labels = reverse_indexer.take(labels, mode='wrap')
482+
np.putmask(new_labels, mask, na_sentinel)
483+
484+
return ordered, _ensure_platform_int(new_labels)

pandas/tests/test_algos.py

-88
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
import numpy as np
44
import pytest
5-
import warnings
65

76
from numpy.random import RandomState
87
from numpy import nan
@@ -60,93 +59,6 @@ def test_strings(self):
6059
tm.assert_series_equal(result, expected)
6160

6261

63-
class TestSafeSort(object):
64-
65-
def test_basic_sort(self):
66-
values = [3, 1, 2, 0, 4]
67-
result = algos.safe_sort(values)
68-
expected = np.array([0, 1, 2, 3, 4])
69-
tm.assert_numpy_array_equal(result, expected)
70-
71-
values = list("baaacb")
72-
result = algos.safe_sort(values)
73-
expected = np.array(list("aaabbc"))
74-
tm.assert_numpy_array_equal(result, expected)
75-
76-
values = []
77-
result = algos.safe_sort(values)
78-
expected = np.array([])
79-
tm.assert_numpy_array_equal(result, expected)
80-
81-
def test_labels(self):
82-
values = [3, 1, 2, 0, 4]
83-
expected = np.array([0, 1, 2, 3, 4])
84-
85-
labels = [0, 1, 1, 2, 3, 0, -1, 4]
86-
result, result_labels = algos.safe_sort(values, labels)
87-
expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
88-
tm.assert_numpy_array_equal(result, expected)
89-
tm.assert_numpy_array_equal(result_labels, expected_labels)
90-
91-
# na_sentinel
92-
labels = [0, 1, 1, 2, 3, 0, 99, 4]
93-
result, result_labels = algos.safe_sort(values, labels,
94-
na_sentinel=99)
95-
expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
96-
tm.assert_numpy_array_equal(result, expected)
97-
tm.assert_numpy_array_equal(result_labels, expected_labels)
98-
99-
# out of bound indices
100-
labels = [0, 101, 102, 2, 3, 0, 99, 4]
101-
result, result_labels = algos.safe_sort(values, labels)
102-
expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp)
103-
tm.assert_numpy_array_equal(result, expected)
104-
tm.assert_numpy_array_equal(result_labels, expected_labels)
105-
106-
labels = []
107-
result, result_labels = algos.safe_sort(values, labels)
108-
expected_labels = np.array([], dtype=np.intp)
109-
tm.assert_numpy_array_equal(result, expected)
110-
tm.assert_numpy_array_equal(result_labels, expected_labels)
111-
112-
def test_mixed_integer(self):
113-
values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object)
114-
result = algos.safe_sort(values)
115-
expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object)
116-
tm.assert_numpy_array_equal(result, expected)
117-
118-
values = np.array(['b', 1, 0, 'a'], dtype=object)
119-
labels = [0, 1, 2, 3, 0, -1, 1]
120-
result, result_labels = algos.safe_sort(values, labels)
121-
expected = np.array([0, 1, 'a', 'b'], dtype=object)
122-
expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
123-
tm.assert_numpy_array_equal(result, expected)
124-
tm.assert_numpy_array_equal(result_labels, expected_labels)
125-
126-
def test_unsortable(self):
127-
# GH 13714
128-
arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
129-
if compat.PY2 and not pd._np_version_under1p10:
130-
# RuntimeWarning: tp_compare didn't return -1 or -2 for exception
131-
with warnings.catch_warnings():
132-
pytest.raises(TypeError, algos.safe_sort, arr)
133-
else:
134-
pytest.raises(TypeError, algos.safe_sort, arr)
135-
136-
def test_exceptions(self):
137-
with tm.assert_raises_regex(TypeError,
138-
"Only list-like objects are allowed"):
139-
algos.safe_sort(values=1)
140-
141-
with tm.assert_raises_regex(TypeError,
142-
"Only list-like objects or None"):
143-
algos.safe_sort(values=[0, 1, 2], labels=1)
144-
145-
with tm.assert_raises_regex(ValueError,
146-
"values should be unique"):
147-
algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1])
148-
149-
15062
class TestFactorize(object):
15163

15264
def test_basic(self):

0 commit comments

Comments
 (0)