Skip to content

Commit

Permalink
refactoring of columns selection (#29)
Browse files Browse the repository at this point in the history
* add conversion model to json : 'param_from_sklearn_model' + corresponding tests

* change wrapper, 'drop_used_columns' and 'drop_unused_columns'

* temp : remove useless attribute

* temp : fix test

* allow selector to select the type of variable among TypeOfVariables.CAT / TEXT / NUM

* change default for numerical encoder

* change test

* temp : new test

* renaming

* comments

* clean docstring

* change text models

* change 'base' models

* change corresponding tests

* add numpy array support

* fix tests

* fix test

* fix random_forest_addins columns_to_use

* fix Targetencoder

* fix special case when no column to pass to the model

* typo

* fix get_feature_names

* allow not to raise when shape between fit and transform differs

* corresponding tests

* cleaning

* fix doc + default

* rename

* fix registration

* black reformat

* clean

* add helper method

* temp add fitting test

* clean

* add test : try to fit model

* add custom default hyper-parameters

* fix inf

* clean

* add test not inf CdfScaler

* cap number of component to nb of rows

* fix seed by default

* clean

* make CdfScaler handle very small, almost equal values

* test very close and  very small values

* cleaning

* miscellaneous

* add min_count param

* more data in test

* remove cast of string that can be parsed

* corresponding test
  • Loading branch information
Lionel MASSOULARD authored and GitHub Enterprise committed Feb 21, 2020
1 parent 4f67fd6 commit 0f8a287
Show file tree
Hide file tree
Showing 20 changed files with 1,389 additions and 800 deletions.
25 changes: 1 addition & 24 deletions aikit/ml_machine/hyper_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,27 +564,4 @@ def __add__(self, other):
raise TypeError("I don't know how to add this type %s" % type(other))

return HyperCrossProduct(res, random_state=self.random_state)
# TODO : make sure the seed is set


# class HyperRandomVariable(AbstractHyper):
# """ draw along the distribution of the parameters
#
# Examples
# --------
# >>> hp = HyperRandomVariable({'a': randint(1, 20)})
# >>> hp.get_rand()
# """
#
# def __init__(self, list_of_hyperparameters):
# if not isinstance(list_of_hyperparameters, dict):
# raise TypeError("I don't know how to deal with that \
# type of list of parameters")
#
# self.list_of_hyperparameters = list_of_hyperparameters
#
# def get_rand(self):
# return list(ParameterSampler(self.list_of_hyperparameters, 1))[0]
#
# def get_size(self):
# return _get_size(self.list_of_hyperparameters)

16 changes: 9 additions & 7 deletions aikit/ml_machine/ml_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,13 @@ def columns_informations(self, new_columns_informations):
@columns_informations.deleter
def columns_informations(self):
self._columns_informations = None

def infos(self):
    """ Return a DataFrame summarising what is known about each variable.

    The rows come from the transposed `columns_informations`. When `dfX`
    is set, the dtypes of `dfX` are appended as an extra "type" column.
    """
    # No data attached: only the registered informations can be shown.
    if self.dfX is None:
        return pd.DataFrame(self.columns_informations).T

    informations = pd.DataFrame(self.columns_informations).T
    dtypes = self.dfX.dtypes.rename("type")
    return pd.concat((informations, dtypes), axis=1)

# @property
# def var_type_columns_dico(self):
Expand Down Expand Up @@ -786,17 +793,12 @@ def iterator_default_models(self):
# Blocks
blocks_to_use = tuple(self.auto_ml_config.columns_block.keys()) # keep all blocks

# TODO : maybe add models obtained by removing one block at a time ?
# if len(blocks_to_use) == 1:
# all_blocks = [blocks_to_use]
# else:
# all_blocks = [blocks_to_use] + [tuple(diff(blocks_to_use,[b])) for b in blocks_to_use]

# Hyper
hyper_parameters_by_step = {}
for step_name, model_name in models_by_steps.items():
if model_name[0] is not None:
default_parameters = MODEL_REGISTER.informations.get(model_name, {}).get("default_parameters", {})
default_parameters = MODEL_REGISTER.default_hyper_parameters.get(model_name, {})
# default_parameters = MODEL_REGISTER.informations.get(model_name, {}).get("default_parameters", {})
# If default_parameters present in register use it, otherwise use {} (and so will go back to default parameter of the model)
hyper_parameters_by_step[(step_name, model_name)] = default_parameters

Expand Down
33 changes: 26 additions & 7 deletions aikit/ml_machine/ml_machine_registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,26 @@ class ModelRepresentationBase(_AbstractModelRepresentation):
"analyzer": hp.HyperChoice(["word", "char", "char_wb"]),
"penalty": ["l1", "l2"],
"random_state": [123], # So that every for every model with a random_state attribute, it will be passed and fix
"columns_to_encode": ["--object--"],

"drop_used_columns":[True],
"drop_unused_columns":[True]
}
# This dictionary is used to specify the default hyper-parameters that are used during the random search phase
# They will be used if :
# * the model has a parameter among that list
# * the parameter is not specified within the class (within 'custom_hyper')



default_default_hyper = {
"random_state":123,
"drop_used_columns":True,
"drop_unused_columns":True
}
# This dictionary is used to specify the default hyper-parameters that are used during the default model phase
# They will be used if :
# * the model has a parameter among that list
# * the default parameter is not specified within the class (within 'default_parameters')


### Linear
Expand Down Expand Up @@ -505,7 +523,8 @@ class TextNltkProcessing_DigitAnonymizer(ModelRepresentationBase):
class NumericalEncoder_CatEncoder(ModelRepresentationBase):
klass = NumericalEncoder
category = StepCategories.CategoryEncoder
type_of_variable = (TypeOfVariables.CAT, TypeOfVariables.NUM)

type_of_variable = (TypeOfVariables.CAT, )

custom_hyper = {"encoding_type": ["dummy", "num"], "min_nb_observations": hp.HyperRangeInt(2, 20)}

Expand All @@ -522,7 +541,7 @@ class TargetEncoderClassifier_CatEncoder(ModelRepresentationBase):
klass = TargetEncoderClassifier
category = StepCategories.CategoryEncoder

type_of_variable = (TypeOfVariables.CAT, TypeOfVariables.NUM)
type_of_variable = (TypeOfVariables.CAT, )

custom_hyper = {
"cv": [None, 2, 5, 10],
Expand Down Expand Up @@ -565,7 +584,7 @@ class TargetEncoderRegressor_CatEncoder(ModelRepresentationBase):
class NumImputer_Inputer(ModelRepresentationBase):
klass = NumImputer
category = StepCategories.MissingValueImputer
type_of_variable = None # Peut etre faire que sur NUM, CAT, TEXT a priori on aura jamais de valeur manquante ?
type_of_variable = None

type_of_model = None
use_y = False
Expand All @@ -584,7 +603,7 @@ class TruncatedSVD_DimensionReduction(ModelRepresentationBase):
type_of_model = None
use_y = False

custom_hyper = {"keep_other_columns": ["keep", "drop"]}
custom_hyper = {"drop_used_columns": [True, False]}


@register
Expand Down Expand Up @@ -615,7 +634,7 @@ class Text_TruncatedSVD_DimensionReduction(ModelRepresentationBase):
type_of_model = None
use_y = False

custom_hyper = {"keep_other_columns": ["keep", "drop"]}
custom_hyper = {"drop_used_columns": [True, False]}


@register
Expand All @@ -631,7 +650,7 @@ class KMeansTransformer_DimensionReduction(ModelRepresentationBase):
use_y = False
type_of_variable = None

custom_hyper = {"keep_other_columns": ["keep", "drop"]}
custom_hyper = {"drop_used_columns": [True, False]}


@register
Expand Down
37 changes: 31 additions & 6 deletions aikit/ml_machine/model_registrer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ def __init__(self):

def reset(self):
# Re-initialise every table of the register to an empty container.
# Hyper-parameters stored per registered key (filled by 'register_new_class' when 'hyper' is given).
self.hyper_parameters = {}
# Default hyper-parameters stored per registered key (filled by 'register_new_class' when 'default_hyper' is given).
self.default_hyper_parameters = {}
self.init_parameters = {}
# Extra keyword informations stored per registered key (filled by 'register_new_class' from its **kwargs).
self.informations = {}
self.all_registered = []

def register_new_class(self, category, klass, hyper=None, **kwargs):
def register_new_class(self, category, klass, hyper=None, default_hyper=None, **kwargs):

if not isinstance(klass, type):
raise TypeError("klass should be klass")
Expand All @@ -65,6 +66,9 @@ def register_new_class(self, category, klass, hyper=None, **kwargs):

if hyper is not None:
self.hyper_parameters[key] = hyper

if default_hyper is not None:
self.default_hyper_parameters[key] = default_hyper

if kwargs:
self.informations[key] = {k: v for k, v in kwargs.items()}
Expand Down Expand Up @@ -116,7 +120,7 @@ def register(klass):
other = {
k: v
for k, v in klass.__dict__.items()
if not k.startswith("_") and k not in ("name", "klass", "custom_hyper", "default_param", "category")
if not k.startswith("_") and k not in ("name", "klass", "custom_hyper", "default_parameters", "category")
}

if klass.category is None:
Expand All @@ -126,7 +130,11 @@ def register(klass):
raise ValueError("I must specify a klass for this klass")

MODEL_REGISTER.register_new_class(
category=klass.category, klass=klass.klass, hyper=klass.get_hyper_parameter(), **other
category=klass.category,
klass=klass.klass,
hyper=klass.get_hyper_parameter(),
default_hyper=klass.get_default_hyper_parameter(),
**other
)

return klass
Expand All @@ -140,7 +148,8 @@ class _AbstractModelRepresentation(object):

custom_hyper = {}
default_hyper = {}

default_parameters = {}

hyper = None

def __init__(self):
Expand All @@ -166,7 +175,23 @@ def get_hyper_parameter(cls):
elif p in cls.default_hyper:
all_hyper[p] = cls.default_hyper[p]

return hp.HyperCrossProduct(all_hyper) # TODO : fix seed here

return hp.HyperCrossProduct(all_hyper)

@classmethod
def get_default_hyper_parameter(cls):
    """ Build the default parameters of the model represented by this class.

    For each init parameter of `cls.klass`, the value is taken from
    `cls.default_parameters` if present there, otherwise from
    `cls.default_default_hyper` if present there. Parameters found in
    neither are left out of the result.

    Returns
    -------
    dict : parameter name -> default value

    Raises
    ------
    ValueError if `cls.klass` is None
    """
    if cls.klass is None:
        raise ValueError("I need a klass")

    resolved = {}
    for param_name in get_init_parameters(cls.klass).keys():
        # Class-specific defaults take priority over the shared ones.
        if param_name in cls.default_parameters:
            resolved[param_name] = cls.default_parameters[param_name]
            continue

        if param_name in cls.default_default_hyper:
            resolved[param_name] = cls.default_default_hyper[param_name]

    return resolved

MODEL_REGISTER = _MODEL_REGISTER()
4 changes: 2 additions & 2 deletions aikit/models/random_forest_addins.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,7 @@ def __init__(
do_svd=False,
svd_n_components=100,
other_rf_params=None,
columns_to_use=None,
columns_to_use="all",
desired_output_type=None,
):

Expand Down Expand Up @@ -964,7 +964,7 @@ def __init__(
do_svd=False,
svd_n_components=100,
other_rf_params=None,
columns_to_use=None,
columns_to_use="all",
desired_output_type=None,
):

Expand Down
Loading

0 comments on commit 0f8a287

Please sign in to comment.