diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3e2da3e95f396..87a049c77dc32 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1693,9 +1693,8 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: # Indexing on codes is more efficient if categories are the same, # so we can apply some optimizations based on the degree of # dtype-matching. - codes = recode_for_categories( - target.codes, target.categories, self.categories, copy=False - ) + cat = self._encode_with_my_categories(target) + codes = cat._codes else: codes = self.categories.get_indexer(target) @@ -1867,8 +1866,8 @@ def _validate_setitem_value(self, value): "without identical categories" ) # is_dtype_equal implies categories_match_up_to_permutation - new_codes = self._validate_listlike(value) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + value = self._encode_with_my_categories(value) + return value._codes # wrap scalars and hashable-listlikes in list rvalue = value if not is_hashable(value) else [value] @@ -2100,8 +2099,8 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self._categories_match_up_to_permutation(other): - other_codes = self._validate_listlike(other) - return np.array_equal(self._codes, other_codes) + other = self._encode_with_my_categories(other) + return np.array_equal(self._codes, other._codes) return False @classmethod @@ -2112,6 +2111,23 @@ def _concat_same_type(self, to_concat): # ------------------------------------------------------------------ + def _encode_with_my_categories(self, other: "Categorical") -> "Categorical": + """ + Re-encode another categorical using this Categorical's categories. + + Notes + ----- + This assumes we have already checked + self._categories_match_up_to_permutation(other). + """ + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + codes = recode_for_categories( + other.codes, other.categories, self.categories, copy=False + ) + return self._from_backing_data(codes) + def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: """ Returns True if categoricals are the same dtype diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 99dc01ef421d1..a38d9cbad0d64 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -301,7 +301,7 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - all_codes = [first._validate_listlike(x) for x in to_union] + all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: