Skip to content

Commit

Permalink
Add "median" to TargetEncoder (#4722)
Browse files Browse the repository at this point in the history
This PR enables `TargetEncoder` to encode the `median` of the target column with respect to one or more categorical columns. The per-fold `for`-loop logic used in this PR is not as fast as the existing optimized path for `mean` and `var`, but it can easily be reused for additional stat functions.

Authors:
  - Jiwei Liu (https://github.com/daxiongshu)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #4722
  • Loading branch information
daxiongshu authored Sep 7, 2022
1 parent 1bd3e6c commit e89e591
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 11 deletions.
47 changes: 37 additions & 10 deletions python/cuml/preprocessing/TargetEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,16 @@
import warnings


def get_stat_func(stat):
    """
    Build a function that applies the statistic named `stat` to an object.

    Parameters
    ----------
    stat : str
        Name of a zero-argument method (for example 'mean', 'var' or
        'median') looked up on the object given to the returned function.

    Returns
    -------
    func : callable
        `func(ds)` looks up the attribute called `stat` on `ds`, calls it
        with no arguments and returns the result.

    Raises
    ------
    ValueError
        Raised by the returned `func` (not by `get_stat_func` itself) when
        `ds` has no callable attribute called `stat`.
    """
    def func(ds):
        # Single lookup; a missing attribute and a non-callable attribute
        # are both reported as "not implemented" instead of letting a
        # TypeError escape from calling a plain data attribute.
        method = getattr(ds, stat, None)
        if not callable(method):
            # implement stat
            raise ValueError(f'{stat} function is not implemented.')
        return method()
    return func


class TargetEncoder:
"""
A cudf based implementation of target encoding [1]_, which converts
Expand Down Expand Up @@ -52,8 +62,9 @@ class TargetEncoder:
in `fit()` or `fit_transform()` functions.
output_type : {'cupy', 'numpy', 'auto'}, default = 'auto'
The data type of output. If 'auto', it matches input data.
stat : {'mean','var'}, default = 'mean'
The statistic used in encoding, mean or variance of the target.
stat : {'mean','var','median'}, default = 'mean'
The statistic used in encoding, mean, variance or median of the
target.
References
----------
Expand Down Expand Up @@ -93,8 +104,8 @@ def __init__(self, n_folds=4, smooth=0, seed=42,
" or 'numpy' or 'auto', "
"got {0}.".format(output_type))
raise ValueError(msg)
if stat not in {'mean', 'var'}:
msg = ("stat should be either 'mean' or 'var'."
if stat not in {'mean', 'var', 'median'}:
msg = ("stat should be 'mean', 'var' or 'median'."
f"got {stat}.")
raise ValueError(msg)

Expand Down Expand Up @@ -232,15 +243,15 @@ def _fit_transform(self, x, y, fold_ids):
self.n_folds = min(self.n_folds, len(train))
train[self.fold_col] = self._make_fold_column(len(train), fold_ids)

self.mean = train[self.y_col].mean()
self.y_stat_val = get_stat_func(self.stat)(train[self.y_col])
if self.stat in ['median']:
return self._fit_transform_for_loop(train, x_cols)

self.mean = train[self.y_col].mean()
if self.stat == 'var':
y_cols = [self.y_col, self.y_col2]
train[self.y_col2] = self._make_y_column(y*y)
self.mean2 = train[self.y_col2].mean()
var = self.mean2 - self.mean**2
n = train.shape[0]
self.var = var * n / (n-1)
else:
y_cols = [self.y_col]

Expand Down Expand Up @@ -277,6 +288,23 @@ def _fit_transform(self, x, y, fold_ids):
del encode_each_fold
return self._impute_and_sort(train), train

def _fit_transform_for_loop(self, train, x_cols):
    """
    Encode `train` fold by fold using the groupby aggregation named by
    `self.stat`.

    For every distinct value of the fold column, the statistic of
    `self.y_col` is aggregated per `x_cols` group over the rows that are
    NOT in that fold, and left-merged onto the rows that are in the fold.
    The statistic aggregated over all rows is stored in
    `self.encode_all`.

    Parameters
    ----------
    train : DataFrame holding the fold column, `self.y_col` and `x_cols`
    x_cols : column name(s) to group by and merge on

    Returns
    -------
    tuple
        `(self._impute_and_sort(encoded), train)`, where `encoded` is the
        concatenation of the per-fold merge results.
    """

    def _as_encoding(grouped):
        # Name the single aggregated column `self.out_col` and turn the
        # group keys back into regular columns so they can be merged on.
        grouped.columns = [self.out_col]
        return grouped.reset_index()

    agg_spec = {self.y_col: self.stat}
    fold_ids = train[self.fold_col]

    encoded_folds = []
    for fold in fold_ids.unique().values_host:
        in_fold = fold_ids == fold
        # Statistic computed from every row outside the current fold.
        out_of_fold = train.loc[~in_fold].groupby(x_cols).agg(agg_spec)
        out_of_fold = _as_encoding(out_of_fold)
        encoded_folds.append(
            train.loc[in_fold].merge(out_of_fold, on=x_cols, how='left')
        )
    encoded = cudf.concat(encoded_folds, axis=0)

    # Statistic over the full training data, kept on the instance.
    self.encode_all = _as_encoding(train.groupby(x_cols).agg(agg_spec))
    return self._impute_and_sort(encoded), train

def _make_y_column(self, y):
"""
Create a target column given y
Expand Down Expand Up @@ -387,9 +415,8 @@ def _impute_and_sort(self, df):
"""
Impute and sort the result encoding in the same row order as input
"""
impute_val = self.var if self.stat == 'var' else self.mean
df[self.out_col] = df[self.out_col].nans_to_nulls()
df[self.out_col] = df[self.out_col].fillna(impute_val)
df[self.out_col] = df[self.out_col].fillna(self.y_stat_val)
df = df.sort_values(self.id_col)
res = df[self.out_col].values.copy()
if self.output_type == 'numpy':
Expand Down
18 changes: 17 additions & 1 deletion python/cuml/tests/test_target_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_targetencoder_transform():

@pytest.mark.parametrize('n_samples', [5000, 500000])
@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64])
@pytest.mark.parametrize('stat', ['mean', 'var'])
@pytest.mark.parametrize('stat', ['mean', 'var', 'median'])
def test_targetencoder_random(n_samples, dtype, stat):

x = cp.random.randint(0, 1000, n_samples).astype(dtype)
Expand Down Expand Up @@ -277,3 +277,19 @@ def test_get_params():
p2 = encoder.get_params()
for k, v in params.items():
assert v == p2[k]


def test_targetencoder_median():
    """
    Check the 'median' statistic on a small two-category frame: the
    encoding returned by fit_transform(), and the one returned by fit()
    followed by transform() on the same data, must both equal the
    expected array.
    """
    categories = ['a'] * 4 + ['b'] * 4
    labels = [1, 22, 15, 17, 70, 9, 99, 56]
    train = cudf.DataFrame({'category': categories, 'label': labels})
    answer = np.array([17., 15., 17., 15., 56., 70., 56., 70.])

    # One-step path: fit and encode in a single call.
    one_step = TargetEncoder(stat='median')
    encoded = one_step.fit_transform(train.category, train.label)
    assert array_equal(encoded, answer)

    # Two-step path: fit first, then transform the same categories.
    two_step = TargetEncoder(stat='median')
    two_step.fit(train.category, train.label)
    encoded = two_step.transform(train.category)
    assert array_equal(encoded, answer)

0 comments on commit e89e591

Please sign in to comment.