Skip to content

Commit

Permalink
Merge pull request #1 from ngirase10/feat/categorical-update-categories
Browse files Browse the repository at this point in the history
Add optional parameter for factorize to pass in original codes, uniques
  • Loading branch information
minjooki authored Dec 11, 2023
2 parents 7012d6a + d330234 commit f1a84bb
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 0 deletions.
23 changes: 23 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,7 @@ def factorize(
sort: bool = False,
use_na_sentinel: bool = True,
size_hint: int | None = None,
original_factorization: tuple[np.ndarray, np.ndarray | Index] | None = None
) -> tuple[np.ndarray, np.ndarray | Index]:
"""
Encode the object as an enumerated type or categorical variable.
Expand Down Expand Up @@ -758,6 +759,28 @@ def factorize(
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
# should happen here.

# Early-return path: when the caller passes a previous factorization, extend
# that mapping instead of factorizing ``values`` from scratch.
# NOTE(review): ``sort``, ``use_na_sentinel`` and ``size_hint`` are not
# consulted anywhere on this path -- confirm that is intended.
if original_factorization is not None:
# NOTE(review): the parameter is annotated like this function's return value,
# i.e. (codes, uniques), and this branch itself returns (codes, uniques), but
# the tuple is unpacked here as (uniques, codes) -- the order looks swapped;
# confirm against callers.
# ``original_codes`` is not used after this line.
original_uniques, original_codes = original_factorization
# Lookup table: unique value -> its position in ``original_uniques``.
unique_to_code = dict(zip(original_uniques, range(len(original_uniques))))

# Values already present in the original uniques keep their original code;
# unseen values get the next free integer, in order of first appearance.
new_codes = []
new_uniques = list(original_uniques)
next_code = len(original_uniques)

for item in values:
if item in unique_to_code:
new_codes.append(unique_to_code[item])
else:
# First occurrence of a value the original factorization did not contain.
unique_to_code[item] = next_code
new_uniques.append(item)
new_codes.append(next_code)
next_code += 1

# NOTE(review): both results are plain Python lists, whereas the signature
# annotates ``tuple[np.ndarray, np.ndarray | Index]`` -- confirm whether
# callers expect arrays here.
return new_codes, new_uniques


if isinstance(values, (ABCIndex, ABCSeries)):
return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

Expand Down
22 changes: 22 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,28 @@ def _from_inferred_categories(

return cls._simple_new(codes, dtype=dtype)

def update_categories(self, new_categories):
    """
    Append any not-yet-present categories to this object, in place.

    Existing categories keep their positions. Each element of
    ``new_categories`` that is not already present is appended in the
    order given; duplicates within ``new_categories`` are added once.

    Parameters
    ----------
    new_categories : list
        Candidate categories to add.

    Returns
    -------
    None
        The combined category list is handed to ``self.set_categories``.

    Raises
    ------
    ValueError
        If ``new_categories`` is not a ``list``.
    """
    # This reads and mutates instance state, so it has to be a regular
    # method: under ``@classmethod`` the first argument is the class, not
    # an instance.
    if not isinstance(new_categories, list):
        raise ValueError("new_categories must be a list")

    # Start from the current categories so their order is preserved, then
    # append only the values that are not there yet.
    updated_categories = list(self.categories)
    for cat in new_categories:
        if cat not in updated_categories:
            updated_categories.append(cat)

    # NOTE(review): the ``inplace`` keyword of ``set_categories`` was
    # deprecated in pandas 1.3 and removed in 2.0 -- confirm this branch
    # still accepts it.
    self.set_categories(updated_categories, inplace=True)

@classmethod
def from_codes(
cls,
Expand Down

0 comments on commit f1a84bb

Please sign in to comment.