From 5f64baeb359c19a34ac7368e2bea7cb8933bed69 Mon Sep 17 00:00:00 2001 From: gketronDS Date: Tue, 6 Aug 2024 10:15:04 -0700 Subject: [PATCH 1/4] test --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0a404280..a9f180fe 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- #TODO update this from setuptools import setup, find_packages - +#test def calculate_version(): initpy = open('tpot2/_version.py').read().split('\n') From cc2662387f50a4f7880acfecb831720aa95822c0 Mon Sep 17 00:00:00 2001 From: gketron Date: Fri, 4 Oct 2024 16:12:47 -0700 Subject: [PATCH 2/4] Added Ordinal Encoder Support (Useful for some Deep Neural Network achitectures.) --- ImputerExperiments/data/r/.DS_Store | Bin 0 -> 6148 bytes tpot2/builtin_modules/__init__.py | 2 +- .../builtin_modules/column_one_hot_encoder.py | 137 +++++++++++++++++- tpot2/config/get_configspace.py | 8 +- 4 files changed, 142 insertions(+), 5 deletions(-) create mode 100644 ImputerExperiments/data/r/.DS_Store diff --git a/ImputerExperiments/data/r/.DS_Store b/ImputerExperiments/data/r/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..605eba826e16686a6ed6279ace1de81c7da8f248 GIT binary patch literal 6148 zcmeHKO)msN5Pi)iganC;3mIq0COC-OEbHJP#JVqb8De%v%=o&^-GA^G{5~gMRd-}& zb`Td4=}Nk)zg|`Ko1W=u0A}m9zY5F&m}C*#n4+o?8J995W3ESVMtmf_(>OV0OWb+! z40s0qIs@|Vj$;*@*w~Fr?{AAE%{LWY!~|||h6qjckm9a9+dNff&$`)NT;U#FWW=V_ zwP?Il(DNVYYs5HV#4}tVVZG1VO}!VhSn)E04zYJc^bw$iAzLvXna3s4O8&1E?7>%9 zm)$vT=`Y3(9&n9y95Pa*V<}@KtnE=t#xb^88*o<4)&u!C!U6WF*H|CLbqmO}f~{q6 zEUM(+{4)P?#7pKjWF9SDO@KCe%2-n7no)PTl7Y6GT)_}gnLFZu=fX}|D#RAACEn?6 zz-J&#B=r8BZ`m0|-^)MpeMlZVd_FDipdNQkP4>ETixF0S6d@xwC)`g%EfLq#=dO^x zj6K3@eN@@2#i(rly70^vnVh!PYtMjZz%%f{fP5bU7O|nTxwZP~pwcSWz`tU^G}@i^7MJAC){(`@SsSn%vxvyN+**ZDx#L(X fauhGHsBkQh3T^0YZY_H#{zpJ*@X9mrs|>sYj#A^) literal 0 HcmV?d00001 diff --git a/tpot2/builtin_modules/__init__.py b/tpot2/builtin_modules/__init__.py index 7f825e66..6a96dda1 100644 --- a/tpot2/builtin_modules/__init__.py +++ b/tpot2/builtin_modules/__init__.py @@ -1,6 +1,6 @@ from .feature_set_selector import FeatureSetSelector from .zero_count import ZeroCount -from .column_one_hot_encoder import ColumnOneHotEncoder +from .column_one_hot_encoder import ColumnOneHotEncoder, ColumnOrdinalEncoder from .arithmetictransformer import ArithmeticTransformer from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from .passthrough import Passthrough, SkipTransformer diff --git a/tpot2/builtin_modules/column_one_hot_encoder.py b/tpot2/builtin_modules/column_one_hot_encoder.py index d3472b5c..d1808bc3 100644 --- a/tpot2/builtin_modules/column_one_hot_encoder.py +++ b/tpot2/builtin_modules/column_one_hot_encoder.py @@ -3,7 +3,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array -from sklearn.preprocessing import OneHotEncoder +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder import sklearn import pandas as pd @@ -168,3 +168,138 @@ def transform(self, X): return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1) else: return np.hstack((X_not_sel, X_sel)) + +class ColumnOrdinalEncoder(BaseEstimator, TransformerMixin): + + + def __init__(self, columns='auto', handle_unknown='error', unknown_value = -1, encoded_missing_value = np.nan, min_frequency=None,max_categories=None): + ''' + + Parameters + ---------- + + columns : str, list, default='auto' + Determines which columns to onehot encode with sklearn.preprocessing.OneHotEncoder. + - 'auto' : Automatically select categorical features based on columns with less than 10 unique values + - 'categorical' : Automatically select categorical features + - 'numeric' : Automatically select numeric features + - 'all' : Select all features + - list : A list of columns to select + + drop, handle_unknown, sparse_output, min_frequency, max_categories : see sklearn.preprocessing.OneHotEncoder + + ''' + + self.columns = columns + self.handle_unknown = handle_unknown + self.unknown_value = unknown_value + self.encoded_missing_value = encoded_missing_value + self.min_frequency = min_frequency + self.max_categories = max_categories + + + + def fit(self, X, y=None): + """Fit OneHotEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + y: array-like {n_samples,} (Optional, ignored) + Feature labels + """ + + if (self.columns == "categorical" or self.columns == "numeric") and not isinstance(X, pd.DataFrame): + raise ValueError(f"Invalid value for columns: {self.columns}. " + "Only 'all' or is supported for np arrays") + + if self.columns == "categorical": + self.columns_ = list(X.select_dtypes(exclude='number').columns) + elif self.columns == "numeric": + self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])] + elif self.columns == "auto": + self.columns_ = auto_select_categorical_features(X) + elif self.columns == "all": + if isinstance(X, pd.DataFrame): + self.columns_ = X.columns + else: + self.columns_ = list(range(X.shape[1])) + elif isinstance(self.columns, list): + self.columns_ = self.columns + else: + raise ValueError(f"Invalid value for columns: {self.columns}") + + if len(self.columns_) == 0: + return self + + self.enc = sklearn.preprocessing.OrdinalEncoder(categories='auto', + handle_unknown = self.handle_unknown, + unknown_value = self.unknown_value, + encoded_missing_value = self.encoded_missing_value, + min_frequency = self.min_frequency, + max_categories = self.max_categories) + #TODO make this more consistent with sklearn baseimputer/baseencoder + ''' + if isinstance(X, pd.DataFrame): + self.enc.set_output(transform="pandas") + for col in X.columns: + # check if the column name is not a string + if not isinstance(col, str): + # if it's not a string, rename the column with "X" prefix + X.rename(columns={col: f"X{col}"}, inplace=True) + ''' + + if len(self.columns_) == X.shape[1]: + X_sel = self.enc.fit(X) + else: + X_sel, X_not_sel = _X_selected(X, self.columns_) + X_sel = self.enc.fit(X_sel) + + return self + + def transform(self, X): + """Transform X using one-hot encoding. + + Parameters + ---------- + X : array-like or sparse matrix, shape=(n_samples, n_features) + Dense array or sparse matrix. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. + """ + + + if len(self.columns_) == 0: + return X + + #TODO make this more consistent with sklearn baseimputer/baseencoder + ''' + if isinstance(X, pd.DataFrame): + for col in X.columns: + # check if the column name is not a string + if not isinstance(col, str): + # if it's not a string, rename the column with "X" prefix + X.rename(columns={col: f"X{col}"}, inplace=True) + ''' + + if len(self.columns_) == X.shape[1]: + return self.enc.transform(X) + else: + + X_sel, X_not_sel= _X_selected(X, self.columns_) + X_sel = self.enc.transform(X_sel) + + #If X is dataframe + if isinstance(X, pd.DataFrame): + + X_sel = pd.DataFrame(X_sel, columns=self.enc.get_feature_names_out()) + return pd.concat([X_not_sel.reset_index(drop=True), X_sel.reset_index(drop=True)], axis=1) + else: + return np.hstack((X_not_sel, X_sel)) \ No newline at end of file diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index 46b13b60..cd63c996 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -26,7 +26,7 @@ from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder -from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer +from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, PassKBinsDiscretizer from tpot2.builtin_modules import Passthrough, SkipTransformer from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor @@ -48,7 +48,7 @@ from sklearn.experimental import enable_iterative_imputer from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer -all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, +all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, ColumnOrdinalEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV, AdaBoostClassifier,MLPRegressor, GaussianProcessRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer, @@ -120,7 +120,7 @@ "regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], - "transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], + "transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "ColumnOrdinalEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], "scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ], "all_transformers" : ["transformers", "scalers"], @@ -290,6 +290,8 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st return transformers.RobustScaler_configspace case "ColumnOneHotEncoder": return {} + case "ColumnOrdinalEncoder": + return {} case "MaxAbsScaler": return {} case "PolynomialFeatures": From 7bc5d21130a1daaab0f18aec70ed6527d6314573 Mon Sep 17 00:00:00 2001 From: gketron Date: Fri, 4 Oct 2024 16:18:34 -0700 Subject: [PATCH 3/4] Remove Test --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index a6475aa4..8a572de9 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- #TODO update this from setuptools import setup, find_packages -#test def calculate_version(): initpy = open('tpot2/_version.py').read().split('\n') From dab722c7696b4a1fe46996cd9961feb3889e2742 Mon Sep 17 00:00:00 2001 From: gketron Date: Thu, 10 Oct 2024 15:18:27 -0700 Subject: [PATCH 4/4] removed OneHotEncoder from transformers, moved to new column_encoder group along with the ordinal encoder. --- tpot2/config/column_encoders.py | 8 ++++++++ tpot2/config/get_configspace.py | 17 ++++++++++------- tpot2/config/transformers.py | 2 -- 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 tpot2/config/column_encoders.py diff --git a/tpot2/config/column_encoders.py b/tpot2/config/column_encoders.py new file mode 100644 index 00000000..0adea3bd --- /dev/null +++ b/tpot2/config/column_encoders.py @@ -0,0 +1,8 @@ +from ConfigSpace import ConfigurationSpace +from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal +from ConfigSpace import EqualsCondition, OrConjunction, NotEqualsCondition, InCondition +import numpy as np + +OneHotEncoder_configspace = {} #TODO include the parameter for max unique values + +OrdinalEncoder_configspace = {} #TODO include the parameter for max unique values \ No newline at end of file diff --git a/tpot2/config/get_configspace.py b/tpot2/config/get_configspace.py index cd63c996..64cd3dbd 100644 --- a/tpot2/config/get_configspace.py +++ b/tpot2/config/get_configspace.py @@ -10,6 +10,7 @@ from . import classifiers from . import transformers from . import selectors +from . import column_encoders from . import regressors from . import autoqtl_builtins from . import imputers @@ -119,10 +120,10 @@ "classifiers" : ["LGBMClassifier", "BaggingClassifier", 'AdaBoostClassifier', 'BernoulliNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier', 'GaussianNB', 'HistGradientBoostingClassifier', 'KNeighborsClassifier','LinearDiscriminantAnalysis', 'LogisticRegression', "LinearSVC", "SVC", 'MLPClassifier', 'MultinomialNB', "QuadraticDiscriminantAnalysis", 'RandomForestClassifier', 'SGDClassifier', 'XGBClassifier'], "regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'], - - "transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "ColumnOrdinalEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], + "column_encoders" : ["ColumnOneHotEncoder", "ColumnOrdinalEncoder"], + "transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"], "scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ], - "all_transformers" : ["transformers", "scalers"], + "all_transformers" : ["transformers", "scalers", "column_encoders"], "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"], "imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"], @@ -288,10 +289,6 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st return transformers.get_QuantileTransformer_configspace(random_state=random_state) case "RobustScaler": return transformers.RobustScaler_configspace - case "ColumnOneHotEncoder": - return {} - case "ColumnOrdinalEncoder": - return {} case "MaxAbsScaler": return {} case "PolynomialFeatures": @@ -301,6 +298,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st case "PassKBinsDiscretizer": return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state) + #column_encoders.py + case "ColumnOneHotEncoder": + return column_encoders.OneHotEncoder_configspace + case "ColumnOrdinalEncoder": + return column_encoders.OrdinalEncoder_configspace + #selectors.py case "SelectFwe": return selectors.SelectFwe_configspace diff --git a/tpot2/config/transformers.py b/tpot2/config/transformers.py index 6d393460..ecf7c20d 100644 --- a/tpot2/config/transformers.py +++ b/tpot2/config/transformers.py @@ -26,8 +26,6 @@ } ) -OneHotEncoder_configspace = {} #TODO include the parameter for max unique values - def get_FastICA_configspace(n_features=100, random_state=None): space = {