-
Notifications
You must be signed in to change notification settings - Fork 541
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[FEA] Add "median" to TargetEncoder #4722
Changes from all commits
cf87af4
e3b7848
e6d8ec3
8b1b7c3
7a51c5a
d7a2f60
6366e9e
7757342
6349067
6c408e5
3f4b89d
e305ae3
d3ee54a
39311f9
591ad28
c6ae54d
7f340a6
c821d78
0e29796
d6b031d
c281b4d
24864fa
02c5ddf
e2a534d
2aa8cca
b68853b
86a3397
c835151
f9015bb
dd38fd0
1dbe878
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,16 @@ | |
import warnings | ||
|
||
|
||
def get_stat_func(stat):
    """
    Build a function that computes the statistic named `stat`.

    Parameters
    ----------
    stat : str
        Name of a zero-argument method to call on the input object,
        e.g. 'mean', 'var' or 'median'.

    Returns
    -------
    func : callable
        ``func(ds)`` looks up the attribute `stat` on ``ds``, calls it with
        no arguments and returns the result.

    Raises
    ------
    ValueError
        Raised by ``func`` (not by ``get_stat_func`` itself) when ``ds``
        has no callable attribute named `stat`.
    """
    def func(ds):
        # A single lookup; a missing attribute and a non-callable one
        # (e.g. a plain data attribute) are both reported the same way
        # instead of the latter surfacing as an opaque TypeError.
        method = getattr(ds, stat, None)
        if not callable(method):
            raise ValueError(f'{stat} function is not implemented.')
        return method()
    return func
|
||
|
||
class TargetEncoder: | ||
""" | ||
A cudf based implementation of target encoding [1]_, which converts | ||
|
@@ -52,8 +62,9 @@ class TargetEncoder: | |
in `fit()` or `fit_transform()` functions. | ||
output_type : {'cupy', 'numpy', 'auto'}, default = 'auto' | ||
The data type of output. If 'auto', it matches input data. | ||
stat : {'mean','var'}, default = 'mean' | ||
The statistic used in encoding, mean or variance of the target. | ||
stat : {'mean','var','median'}, default = 'mean' | ||
The statistic used in encoding, mean, variance or median of the | ||
target. | ||
|
||
References | ||
---------- | ||
|
@@ -93,8 +104,8 @@ def __init__(self, n_folds=4, smooth=0, seed=42, | |
" or 'numpy' or 'auto', " | ||
"got {0}.".format(output_type)) | ||
raise ValueError(msg) | ||
if stat not in {'mean', 'var'}: | ||
msg = ("stat should be either 'mean' or 'var'." | ||
if stat not in {'mean', 'var', 'median'}: | ||
msg = ("stat should be 'mean', 'var' or 'median'." | ||
f"got {stat}.") | ||
raise ValueError(msg) | ||
|
||
|
@@ -232,15 +243,15 @@ def _fit_transform(self, x, y, fold_ids): | |
self.n_folds = min(self.n_folds, len(train)) | ||
train[self.fold_col] = self._make_fold_column(len(train), fold_ids) | ||
|
||
self.mean = train[self.y_col].mean() | ||
self.y_stat_val = get_stat_func(self.stat)(train[self.y_col]) | ||
if self.stat in ['median']: | ||
return self._fit_transform_for_loop(train, x_cols) | ||
Comment on lines
+247
to
+248
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think a better design pattern would be to put the logic that calculates each statistic in separate functions, not only the […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you for the comment. Do you mean something like the following to separate the logic?
|
||
|
||
self.mean = train[self.y_col].mean() | ||
if self.stat == 'var': | ||
y_cols = [self.y_col, self.y_col2] | ||
train[self.y_col2] = self._make_y_column(y*y) | ||
self.mean2 = train[self.y_col2].mean() | ||
var = self.mean2 - self.mean**2 | ||
n = train.shape[0] | ||
self.var = var * n / (n-1) | ||
else: | ||
y_cols = [self.y_col] | ||
|
||
|
@@ -277,6 +288,23 @@ def _fit_transform(self, x, y, fold_ids): | |
del encode_each_fold | ||
return self._impute_and_sort(train), train | ||
|
||
def _fit_transform_for_loop(self, train, x_cols):
    """
    Encode `train` fold by fold using the statistic named by `self.stat`.

    For every distinct value in the fold column, the rows outside that
    fold are grouped by `x_cols` and aggregated, and the result is
    left-merged onto the rows inside the fold. The aggregation over all
    of `train` is stored in `self.encode_all`.

    Parameters
    ----------
    train : cudf.DataFrame
        Frame holding the feature, target and fold columns.
    x_cols : list
        Names of the columns to group by.

    Returns
    -------
    tuple
        ``(self._impute_and_sort(merged), train)``.
    """
    def _grouped_stat(frame):
        # Aggregate the target per group and expose it under out_col.
        encoded = frame.groupby(x_cols).agg({self.y_col: self.stat})
        encoded.columns = [self.out_col]
        return encoded.reset_index()

    fold_ids = train[self.fold_col]
    parts = []
    for fold in fold_ids.unique().values_host:
        in_fold = fold_ids == fold
        # Statistic is computed only from rows that are NOT in this fold.
        out_of_fold = _grouped_stat(train.loc[~in_fold])
        parts.append(
            train.loc[in_fold].merge(out_of_fold, on=x_cols, how='left')
        )
    merged = cudf.concat(parts, axis=0)
    self.encode_all = _grouped_stat(train)
    return self._impute_and_sort(merged), train
|
||
def _make_y_column(self, y): | ||
""" | ||
Create a target column given y | ||
|
@@ -387,9 +415,8 @@ def _impute_and_sort(self, df): | |
""" | ||
Impute and sort the result encoding in the same row order as input | ||
""" | ||
impute_val = self.var if self.stat == 'var' else self.mean | ||
df[self.out_col] = df[self.out_col].nans_to_nulls() | ||
df[self.out_col] = df[self.out_col].fillna(impute_val) | ||
df[self.out_col] = df[self.out_col].fillna(self.y_stat_val) | ||
df = df.sort_values(self.id_col) | ||
res = df[self.out_col].values.copy() | ||
if self.output_type == 'numpy': | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,7 @@ def test_targetencoder_transform(): | |
|
||
@pytest.mark.parametrize('n_samples', [5000, 500000]) | ||
@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64]) | ||
@pytest.mark.parametrize('stat', ['mean', 'var']) | ||
@pytest.mark.parametrize('stat', ['mean', 'var', 'median']) | ||
def test_targetencoder_random(n_samples, dtype, stat): | ||
|
||
x = cp.random.randint(0, 1000, n_samples).astype(dtype) | ||
|
@@ -277,3 +277,19 @@ def test_get_params(): | |
p2 = encoder.get_params() | ||
for k, v in params.items(): | ||
assert v == p2[k] | ||
|
||
|
||
def test_targetencoder_median():
    """Check the 'median' encoding on a small two-category frame."""
    # Review note from the PR thread: the test is fine as is; if the
    # statistic calculations are separated out of the encoder, lower-level
    # tests of the individual stats could be added as well.
    categories = ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b']
    labels = [1, 22, 15, 17, 70, 9, 99, 56]
    train = cudf.DataFrame({'category': categories, 'label': labels})
    answer = np.array([17., 15., 17., 15., 56., 70., 56., 70.])

    # Fit and transform in a single call.
    encoder = TargetEncoder(stat='median')
    train_encoded = encoder.fit_transform(train.category, train.label)
    assert array_equal(train_encoded, answer)

    # A fresh encoder fitted first and then applied to the same data is
    # expected to produce the same encoding.
    encoder = TargetEncoder(stat='median')
    encoder.fit(train.category, train.label)
    train_encoded = encoder.transform(train.category)
    assert array_equal(train_encoded, answer)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@dantegd what do you think of the change here? Thank you.