44"""
55from __future__ import division
66from warnings import warn , catch_warnings
7+ from textwrap import dedent
8+
79import numpy as np
810
911from pandas .core .dtypes .cast import (
3436from pandas .core import common as com
3537from pandas ._libs import algos , lib , hashtable as htable
3638from pandas ._libs .tslib import iNaT
37- from pandas .util ._decorators import deprecate_kwarg
39+ from pandas .util ._decorators import (Appender , Substitution ,
40+ deprecate_kwarg )
41+
42+ _shared_docs = {}
3843
3944
4045# --------------- #
@@ -146,10 +151,9 @@ def _reconstruct_data(values, dtype, original):
146151 Returns
147152 -------
148153 Index for extension types, otherwise ndarray casted to dtype
149-
150154 """
151155 from pandas import Index
152- if is_categorical_dtype (dtype ):
156+ if is_extension_array_dtype (dtype ):
153157 pass
154158 elif is_datetime64tz_dtype (dtype ) or is_period_dtype (dtype ):
155159 values = Index (original )._shallow_copy (values , name = None )
@@ -469,32 +473,124 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None,
469473 return labels , uniques
470474
471475
472- @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None )
473- def factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
474- """
475- Encode input values as an enumerated type or categorical variable
476+ _shared_docs ['factorize' ] = """
477+ Encode the object as an enumerated type or categorical variable.
478+
479+ This method is useful for obtaining a numeric representation of an
480+ array when all that matters is identifying distinct values. `factorize`
481+ is available as both a top-level function :func:`pandas.factorize`,
482+ and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
476483
477484 Parameters
478485 ----------
479- values : Sequence
480- ndarrays must be 1-D. Sequences that aren't pandas objects are
481- coereced to ndarrays before factorization.
482- sort : boolean, default False
483- Sort by values
486+ %(values)s%(sort)s%(order)s
484487 na_sentinel : int, default -1
485- Value to mark "not found"
486- size_hint : hint to the hashtable sizer
488+ Value to mark "not found".
489+ %( size_hint)s \
487490
488491 Returns
489492 -------
490- labels : the indexer to the original array
491- uniques : ndarray (1-d) or Index
492- the unique values. Index is returned when passed values is Index or
493- Series
493+ labels : ndarray
494+ An integer ndarray that's an indexer into `uniques`.
495+ ``uniques.take(labels)`` will have the same values as `values`.
496+ uniques : ndarray, Index, or Categorical
497+ The unique valid values. When `values` is Categorical, `uniques`
498+ is a Categorical. When `values` is some other pandas object, an
499+ `Index` is returned. Otherwise, a 1-D ndarray is returned.
500+
501+ .. note ::
502+
503+ Even if there's a missing value in `values`, `uniques` will
504+ *not* contain an entry for it.
505+
506+ See Also
507+ --------
508+ pandas.cut : Discretize continuous-valued array.
509+ pandas.unique : Find the unique values in an array.
510+
511+ Examples
512+ --------
513+ These examples all show factorize as a top-level method like
514+ ``pd.factorize(values)``. The results are identical for methods like
515+ :meth:`Series.factorize`.
516+
517+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
518+ >>> labels
519+ array([0, 0, 1, 2, 0])
520+ >>> uniques
521+ array(['b', 'a', 'c'], dtype=object)
522+
523+ With ``sort=True``, the `uniques` will be sorted, and `labels` will be
524+ shuffled so that the relationship is maintained.
525+
526+ >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
527+ >>> labels
528+ array([1, 1, 0, 2, 1])
529+ >>> uniques
530+ array(['a', 'b', 'c'], dtype=object)
531+
532+ Missing values are indicated in `labels` with `na_sentinel`
533+ (``-1`` by default). Note that missing values are never
534+ included in `uniques`.
535+
536+ >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
537+ >>> labels
538+ array([ 0, -1, 1, 2, 0])
539+ >>> uniques
540+ array(['b', 'a', 'c'], dtype=object)
494541
495- note: an array of Periods will ignore sort as it returns an always sorted
496- PeriodIndex.
542+ Thus far, we've only factorized lists (which are internally coerced to
543+ NumPy arrays). When factorizing pandas objects, the type of `uniques`
544+ will differ. For Categoricals, a `Categorical` is returned.
545+
546+ >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
547+ >>> labels, uniques = pd.factorize(cat)
548+ >>> labels
549+ array([0, 0, 1])
550+ >>> uniques
551+ [a, c]
552+ Categories (3, object): [a, b, c]
553+
554+ Notice that ``'b'`` is in ``uniques.categories``, despite not being
555+ present in ``cat.values``.
556+
557+ For all other pandas objects, an Index of the appropriate type is
558+ returned.
559+
560+ >>> cat = pd.Series(['a', 'a', 'c'])
561+ >>> labels, uniques = pd.factorize(cat)
562+ >>> labels
563+ array([0, 0, 1])
564+ >>> uniques
565+ Index(['a', 'c'], dtype='object')
497566 """
567+
568+
569+ @Substitution (
570+ values = dedent ("""\
571+ values : sequence
572+ A 1-D sequence. Sequences that aren't pandas objects are
573+ coerced to ndarrays before factorization.
574+ """ ),
575+ order = dedent ("""\
576+ order
577+ .. deprecated:: 0.23.0
578+
579+ This parameter has no effect and is deprecated.
580+ """ ),
581+ sort = dedent ("""\
582+ sort : bool, default False
583+ Sort `uniques` and shuffle `labels` to maintain the
584+ relationship.
585+ """ ),
586+ size_hint = dedent ("""\
587+ size_hint : int, optional
588+ Hint to the hashtable sizer.
589+ """ ),
590+ )
591+ @Appender (_shared_docs ['factorize' ])
592+ @deprecate_kwarg (old_arg_name = 'order' , new_arg_name = None )
593+ def factorize (values , sort = False , order = None , na_sentinel = - 1 , size_hint = None ):
498594 # Implementation notes: This method is responsible for 3 things
499595 # 1.) coercing data to array-like (ndarray, Index, extension array)
500596 # 2.) factorizing labels and uniques
@@ -507,9 +603,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
507603 values = _ensure_arraylike (values )
508604 original = values
509605
510- if is_categorical_dtype (values ):
606+ if is_extension_array_dtype (values ):
511607 values = getattr (values , '_values' , values )
512- labels , uniques = values .factorize ()
608+ labels , uniques = values .factorize (na_sentinel = na_sentinel )
513609 dtype = original .dtype
514610 else :
515611 values , dtype , _ = _ensure_data (values )
0 commit comments