From 0e29796493cb7afd6930f608e2e957a2f17f1251 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Sat, 16 Apr 2022 14:19:35 -0700 Subject: [PATCH 1/9] first commit --- python/cuml/preprocessing/TargetEncoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index df45c22d1f..a6c179ecab 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -93,8 +93,8 @@ def __init__(self, n_folds=4, smooth=0, seed=42, " or 'numpy' or 'auto', " "got {0}.".format(output_type)) raise ValueError(msg) - if stat not in {'mean', 'var'}: - msg = ("stat should be either 'mean' or 'var'." + if stat not in {'mean', 'var', 'median'}: + msg = ("stat should be 'mean', 'var' or 'median'." f"got {stat}.") raise ValueError(msg) From d6b031dc1a4d453ae08d14db206000a31522dc9d Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Sun, 17 Apr 2022 07:16:28 -0700 Subject: [PATCH 2/9] docstring --- python/cuml/preprocessing/TargetEncoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index a6c179ecab..3320d11ffe 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -52,8 +52,9 @@ class TargetEncoder: in `fit()` or `fit_transform()` functions. output_type : {'cupy', 'numpy', 'auto'}, default = 'auto' The data type of output. If 'auto', it matches input data. - stat : {'mean','var'}, default = 'mean' - The statistic used in encoding, mean or variance of the target. + stat : {'mean','var','median'}, default = 'mean' + The statistic used in encoding, mean, variance or median of the + target. References ---------- From c281b4d08d9154072a22f6c745ba4200ac006929 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Mon, 18 Apr 2022 16:30:25 -0400 Subject: [PATCH 3/9] start for loop --- python/cuml/preprocessing/TargetEncoder.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 3320d11ffe..e8231e3edc 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -233,6 +233,9 @@ def _fit_transform(self, x, y, fold_ids): self.n_folds = min(self.n_folds, len(train)) train[self.fold_col] = self._make_fold_column(len(train), fold_ids) + if self.stat in ['median']: + return self._fit_transform_for_loop(train, x_cols) + self.mean = train[self.y_col].mean() if self.stat == 'var': @@ -278,6 +281,12 @@ def _fit_transform(self, x, y, fold_ids): del encode_each_fold return self._impute_and_sort(train), train + def _fit_transform_for_loop(self, train, x_cols): + res = [] + for f in train[self.fold_col].unique(): + mask = train[self.fold_col] == f + dg = train.loc[~mask].groupby(x_cols).agg({self.y_col:self.stat}) + def _make_y_column(self, y): """ Create a target column given y From 24864faa4bf28319cbe60274c7f7e1d942c1c098 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Mon, 18 Apr 2022 17:34:22 -0700 Subject: [PATCH 4/9] TODO: change impute_and_sort --- python/cuml/preprocessing/TargetEncoder.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index e8231e3edc..6490e3d4cf 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -286,6 +286,12 @@ def _fit_transform_for_loop(self, train, x_cols): for f in train[self.fold_col].unique(): mask = train[self.fold_col] == f dg = train.loc[~mask].groupby(x_cols).agg({self.y_col:self.stat}) + dg.columns = [self.out_col] + dg = dg.reset_index() + res.append(train.loc[mask].merge(dg, on=x_cols, how='left')) + res = cudf.concat(res,axis=0) + return self._impute_and_sort(res), train + def _make_y_column(self, y): """ From 02c5ddf9ccd5dcb1aa45f58dbe7a36d15f0b0cd4 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 4 May 2022 03:25:18 -0700 Subject: [PATCH 5/9] basic works --- python/cuml/preprocessing/TargetEncoder.py | 10 ++++------ python/cuml/test/test_target_encoder.py | 6 ++++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 6490e3d4cf..1ab4eb3fae 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -233,18 +233,15 @@ def _fit_transform(self, x, y, fold_ids): self.n_folds = min(self.n_folds, len(train)) train[self.fold_col] = self._make_fold_column(len(train), fold_ids) + self.stat_val = eval(f'train[self.y_col].{self.stat}()') if self.stat in ['median']: return self._fit_transform_for_loop(train, x_cols) self.mean = train[self.y_col].mean() - if self.stat == 'var': y_cols = [self.y_col, self.y_col2] train[self.y_col2] = self._make_y_column(y*y) self.mean2 = train[self.y_col2].mean() - var = self.mean2 - self.mean**2 - n = train.shape[0] - self.var = var * n / (n-1) else: y_cols = [self.y_col] @@ -283,7 +280,8 @@ def _fit_transform(self, x, y, fold_ids): def _fit_transform_for_loop(self, train, x_cols): res = [] - for f in train[self.fold_col].unique(): + + for f in train[self.fold_col].unique().values_host: mask = train[self.fold_col] == f dg = train.loc[~mask].groupby(x_cols).agg({self.y_col:self.stat}) dg.columns = [self.out_col] @@ -403,7 +401,7 @@ def _impute_and_sort(self, df): """ Impute and sort the result encoding in the same row order as input """ - impute_val = self.var if self.stat == 'var' else self.mean + impute_val = self.stat_val df[self.out_col] = df[self.out_col].nans_to_nulls() df[self.out_col] = df[self.out_col].fillna(impute_val) df = df.sort_values(self.id_col) diff --git a/python/cuml/test/test_target_encoder.py b/python/cuml/test/test_target_encoder.py index abab0d8bbe..12d3b44b06 100644 --- a/python/cuml/test/test_target_encoder.py +++ b/python/cuml/test/test_target_encoder.py @@ -277,3 +277,9 @@ def test_get_params(): p2 = encoder.get_params() for k, v in params.items(): assert v == p2[k] + +def test_targetencoder_median(): + train = cudf.DataFrame({'category': ['a', 'b', 'b', 'b'], + 'label': [1, 0, 1, 1]}) + encoder = TargetEncoder(stat='median') + train_encoded = encoder.fit_transform(train.category, train.label) \ No newline at end of file From e2a534df0e12c29caf00e4915b81be833fdde0f6 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 4 May 2022 03:46:51 -0700 Subject: [PATCH 6/9] fit_transform test passed --- python/cuml/preprocessing/TargetEncoder.py | 5 ++--- python/cuml/test/test_target_encoder.py | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 1ab4eb3fae..12fbb9714f 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -233,7 +233,7 @@ def _fit_transform(self, x, y, fold_ids): self.n_folds = min(self.n_folds, len(train)) train[self.fold_col] = self._make_fold_column(len(train), fold_ids) - self.stat_val = eval(f'train[self.y_col].{self.stat}()') + self.y_stat_val = eval(f'train[self.y_col].{self.stat}()') if self.stat in ['median']: return self._fit_transform_for_loop(train, x_cols) @@ -401,9 +401,8 @@ def _impute_and_sort(self, df): """ Impute and sort the result encoding in the same row order as input """ - impute_val = self.stat_val df[self.out_col] = df[self.out_col].nans_to_nulls() - df[self.out_col] = df[self.out_col].fillna(impute_val) + df[self.out_col] = df[self.out_col].fillna(self.y_stat_val) df = df.sort_values(self.id_col) res = df[self.out_col].values.copy() if self.output_type == 'numpy': diff --git a/python/cuml/test/test_target_encoder.py b/python/cuml/test/test_target_encoder.py index 12d3b44b06..069ceac372 100644 --- a/python/cuml/test/test_target_encoder.py +++ b/python/cuml/test/test_target_encoder.py @@ -279,7 +279,10 @@ def test_get_params(): assert v == p2[k] def test_targetencoder_median(): - train = cudf.DataFrame({'category': ['a', 'b', 'b', 'b'], - 'label': [1, 0, 1, 1]}) + train = cudf.DataFrame({'category': ['a', 'a', 'a', 'a', + 'b', 'b', 'b', 'b'], + 'label': [1, 22, 15, 17, 70, 9, 99, 56]}) encoder = TargetEncoder(stat='median') - train_encoded = encoder.fit_transform(train.category, train.label) \ No newline at end of file + train_encoded = encoder.fit_transform(train.category, train.label) + answer = np.array([17., 15., 17., 15., 56., 70., 56., 70.]) + assert array_equal(train_encoded, answer) \ No newline at end of file From 2aa8ccaea820b5807f4dfd0e84c2ad7d1a4fd5a3 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Wed, 4 May 2022 04:05:43 -0700 Subject: [PATCH 7/9] transform test passed --- python/cuml/preprocessing/TargetEncoder.py | 16 ++++++++++------ python/cuml/test/test_target_encoder.py | 13 ++++++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 12fbb9714f..c5df498647 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -279,18 +279,22 @@ def _fit_transform(self, x, y, fold_ids): return self._impute_and_sort(train), train def _fit_transform_for_loop(self, train, x_cols): + + def _rename_col(df, col): + df.columns = [col] + return df.reset_index() + res = [] - for f in train[self.fold_col].unique().values_host: mask = train[self.fold_col] == f - dg = train.loc[~mask].groupby(x_cols).agg({self.y_col:self.stat}) - dg.columns = [self.out_col] - dg = dg.reset_index() + dg = train.loc[~mask].groupby(x_cols).agg({self.y_col: self.stat}) + dg = _rename_col(dg, self.out_col) res.append(train.loc[mask].merge(dg, on=x_cols, how='left')) - res = cudf.concat(res,axis=0) + res = cudf.concat(res, axis=0) + self.encode_all = train.groupby(x_cols).agg({self.y_col: self.stat}) + self.encode_all = _rename_col(self.encode_all, self.out_col) return self._impute_and_sort(res), train - def _make_y_column(self, y): """ Create a target column given y diff --git a/python/cuml/test/test_target_encoder.py b/python/cuml/test/test_target_encoder.py index 069ceac372..1600ceb981 100644 --- a/python/cuml/test/test_target_encoder.py +++ b/python/cuml/test/test_target_encoder.py @@ -54,7 +54,7 @@ def test_targetencoder_transform(): @pytest.mark.parametrize('n_samples', [5000, 500000]) @pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize('stat', ['mean', 'var']) +@pytest.mark.parametrize('stat', ['mean', 'var', 'median']) def test_targetencoder_random(n_samples, dtype, stat): x = cp.random.randint(0, 1000, n_samples).astype(dtype) @@ -278,11 +278,18 @@ def test_get_params(): for k, v in params.items(): assert v == p2[k] + def test_targetencoder_median(): - train = cudf.DataFrame({'category': ['a', 'a', 'a', 'a', + train = cudf.DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], 'label': [1, 22, 15, 17, 70, 9, 99, 56]}) encoder = TargetEncoder(stat='median') train_encoded = encoder.fit_transform(train.category, train.label) answer = np.array([17., 15., 17., 15., 56., 70., 56., 70.]) - assert array_equal(train_encoded, answer) \ No newline at end of file + assert array_equal(train_encoded, answer) + + encoder = TargetEncoder(stat='median') + encoder.fit(train.category, train.label) + train_encoded = encoder.transform(train.category) + + assert array_equal(train_encoded, answer) From dd38fd09b34665188cb27b50f098b3d3ed4fc12c Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Thu, 1 Sep 2022 18:34:55 -0700 Subject: [PATCH 8/9] get_stat_func --- python/cuml/preprocessing/TargetEncoder.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index c5df498647..d2a4633208 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -21,6 +21,14 @@ from cuml.common.exceptions import NotFittedError import warnings +def get_stat_func(stat): + def func(ds): + if hasattr(ds, stat): + return getattr(ds, stat)() + else: + # implement stat + raise ValueError(f'{stat} function is not implemented.') + return func class TargetEncoder: """ @@ -233,7 +241,7 @@ def _fit_transform(self, x, y, fold_ids): self.n_folds = min(self.n_folds, len(train)) train[self.fold_col] = self._make_fold_column(len(train), fold_ids) - self.y_stat_val = eval(f'train[self.y_col].{self.stat}()') + self.y_stat_val = get_stat_func(self.stat)(train[self.y_col]) if self.stat in ['median']: return self._fit_transform_for_loop(train, x_cols) From 1dbe878ae11a477a1d4662610ec332db1c98bea5 Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Thu, 1 Sep 2022 18:48:54 -0700 Subject: [PATCH 9/9] fix style --- python/cuml/preprocessing/TargetEncoder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index d2a4633208..1b8b75334b 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -21,6 +21,7 @@ from cuml.common.exceptions import NotFittedError import warnings + def get_stat_func(stat): def func(ds): if hasattr(ds, stat): @@ -30,6 +31,7 @@ def func(ds): raise ValueError(f'{stat} function is not implemented.') return func + class TargetEncoder: """ A cudf based implementation of target encoding [1]_, which converts