Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional parameter for factorize to pass in original codes, uniques #1

Merged
merged 14 commits into from
Dec 11, 2023
23 changes: 23 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,7 @@ def factorize(
sort: bool = False,
use_na_sentinel: bool = True,
size_hint: int | None = None,
original_factorization: tuple[np.ndarray, np.ndarray | Index] | None = None
) -> tuple[np.ndarray, np.ndarray | Index]:
"""
Encode the object as an enumerated type or categorical variable.
Expand Down Expand Up @@ -758,6 +759,28 @@ def factorize(
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.

if original_factorization is not None:
original_uniques, original_codes = original_factorization
unique_to_code = dict(zip(original_uniques, range(len(original_uniques))))

# Map existing data to original codes, assign new codes to new uniques
new_codes = []
new_uniques = list(original_uniques)
next_code = len(original_uniques)

for item in values:
if item in unique_to_code:
new_codes.append(unique_to_code[item])
else:
unique_to_code[item] = next_code
new_uniques.append(item)
new_codes.append(next_code)
next_code += 1

return new_codes, new_uniques


if isinstance(values, (ABCIndex, ABCSeries)):
return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

Expand Down
22 changes: 22 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,28 @@ def _from_inferred_categories(

return cls._simple_new(codes, dtype=dtype)

def update_categories(self, new_categories):
    """
    Append categories that are not already present, preserving order.

    The existing categories keep their positions; each element of
    ``new_categories`` that is not yet a category is appended once, in the
    order it first appears.

    Parameters
    ----------
    new_categories : list
        Candidate categories to add. Entries that are already categories,
        or that repeat within the list, are ignored.

    Returns
    -------
    None
        The combined categories are handed to ``self.set_categories``.

    Raises
    ------
    ValueError
        If ``new_categories`` is not a ``list``.
    """
    # This is an instance method: it reads ``self.categories``, so it must
    # not be decorated with ``@classmethod`` (``self`` would be the class).
    if not isinstance(new_categories, list):
        raise ValueError("new_categories must be a list")

    # Combine existing and new categories, maintaining order and uniqueness.
    updated_categories = list(self.categories)
    # Track membership in a set so each lookup is O(1) instead of rescanning
    # the growing list for every candidate.
    seen = set(updated_categories)
    for cat in new_categories:
        if cat not in seen:
            seen.add(cat)
            updated_categories.append(cat)

    # NOTE(review): ``inplace`` was deprecated and later removed from
    # ``Categorical.set_categories`` in pandas 2.0 -- confirm this call is
    # valid for the targeted pandas version before merging.
    self.set_categories(updated_categories, inplace=True)

@classmethod
def from_codes(
cls,
Expand Down