diff --git a/aikit/__meta__.py b/aikit/__meta__.py
index f7be6b6..e0ac178 100644
--- a/aikit/__meta__.py
+++ b/aikit/__meta__.py
@@ -2,7 +2,7 @@
 Automatic Tool Kit for Machine Learning and Datascience
 """
 
-__version__ = "0.1.5"
+__version__ = "0.1.6-dev"
 
 __author__ = "Lionel Massoulard"
 
diff --git a/aikit/enums.py b/aikit/enums.py
index bf45ebd..9b6431a 100644
--- a/aikit/enums.py
+++ b/aikit/enums.py
@@ -122,16 +122,9 @@ class SpecialModels:
 
     GraphPipeline = "GraphPipeline"
     Pipeline = "Pipeline"
-    ModelsUnion = "ModelsUnion"
+    FeatureUnion = "FeatureUnion"
     ColumnsSelector = "ColumnsSelector"
 
-    alls = (GraphPipeline, Pipeline, ModelsUnion, ColumnsSelector)
+    alls = (GraphPipeline, Pipeline, FeatureUnion, ColumnsSelector)
 
 
-# In[]
-def verif_all():
-    test_TypeOfProblem()
-    test_TypeOfVariables()
-    test_StepCategories()
-    test_DataTypes()
-    test_SpecialModelss()
diff --git a/aikit/model_definition.py b/aikit/model_definition.py
index 6dba7c2..3d1d64f 100644
--- a/aikit/model_definition.py
+++ b/aikit/model_definition.py
@@ -6,6 +6,10 @@
 """
 
 import copy
+import inspect
+
+from sklearn.base import BaseEstimator
+import numpy as np
 
 from aikit.model_registration import DICO_NAME_KLASS
 from aikit.enums import SpecialModels
@@ -158,54 +162,90 @@ def sklearn_model_from_param(param, _copy=True):
         return param
 
 
-# def param_from_sklearn_model(model, _simplify_default = False):
-#
-#    if isinstance(model,Pipeline):
-#        return (SpecialModels.Pipeline,{"steps":[(name,param_from_model(step)) for name,step in model.steps]})
-#
-#    elif isinstance(model,ModelsUnion):
-#        return (SpecialModels.ModelsUnion ,{"transformer_list":[(name,param_from_model(step, _simplify_default = _simplify_default)) for name,step in model.transformer_list],
-#                "n_jobs":model.n_jobs,
-#                "transformer_weights":model.transformer_weights
-#                })
-#
-#    elif isinstance(model, GraphPipeline):
-#        return (SpecialModels.GraphPipeline , {n:param_from_model(p) for n,p in model.models.items() } , model.edges)
-#
-#
-#    elif isinstance(model,BaseEstimator) and model.__class__.__name__ in MODEL_REGISTER.dico_name_class:
-#        if not _simplify_default:
-#            param_dico = {k:param_from_model(v,_simplify_default = _simplify_default) for k,v in model.get_params().items() }
-#        else:
-#            # Experimental
-#            default_params = _get_default_params(model.__class__)
-#            param_dico = {}
-#            for k,v in model.get_params().items():
-#                if not (k in default_params and v == default_params[k]):
-#                    param_dico[k] = param_from_model(v, _simplify_default = _simplify_default)
-#
-#        return (model.__class__.__name__,param_dico)
-#        # Ici : peut etre faire un filtre si on a les valeurs par default ?
-#
-#    elif isinstance(model, (dict,OrderedDict)):
-#        res = model.__class__()
-#        for k,v in model.items():
-#            res[k] = param_from_model(v, _simplify_default = _simplify_default)
-#
-#        return res
-#
-#    elif isinstance(model,list):
-#        return [param_from_model(v,_simplify_default = _simplify_default) for v in model]
-#
-#    elif isinstance(model,tuple):
-#        return tuple([param_from_model(v,_simplify_default = _simplify_default) for v in model])
-#
-#    elif isinstance(model,(np.int64,np.int32)):
-#        return int(model)
-#
-#    elif isinstance(model,(np.float64,np.float32)):
-#        return float(model)
-#
-#    else:
-#        return model
-# In[]
+def filtered_get_params(model, simplify_default=True):
+    """ parameters of a model, skipping those left at their default value when simplify_default is True """
+    if not simplify_default:
+        return model.get_params(deep=False)
+
+    params = model.get_params(deep=False)
+    new_params = params.__class__()
+
+    args = inspect.signature(model.__class__)
+    for param, value in params.items():
+        skip = False
+        if param in args.parameters:
+            if value == args.parameters[param].default:
+                skip = True
+        if not skip:
+            new_params[param] = value
+
+    return new_params
+
+
+def param_from_sklearn_model(model, simplify_default=True):
+    """ convert a sklearn model into its json representation
+
+    Parameters
+    ----------
+    model : sklearn.BaseEstimator
+        the model to convert
+
+    simplify_default : boolean, default=True
+        if True, parameters that are identical to their default values are omitted
+
+    Returns
+    -------
+    json representation of the model
+
+
+    Example
+    -------
+    >>> model = RandomForestClassifier(n_estimators=200)
+    >>> param_from_sklearn_model(model)
+    ('RandomForestClassifier', {'n_estimators': 200})
+
+
+    """
+    if isinstance(model, BaseEstimator):
+        if model.__class__.__name__ not in DICO_NAME_KLASS._mapping:
+            print(f"You'll need to add your class '{model.__class__.__name__}' to the register to be able to reload it")
+
+        if simplify_default:
+            param_dico = {k: param_from_sklearn_model(v, simplify_default=simplify_default) for k, v in filtered_get_params(model, simplify_default=True).items()}
+        else:
+            param_dico = {k: param_from_sklearn_model(v, simplify_default=simplify_default) for k, v in model.get_params(deep=False).items()}
+
+        return (model.__class__.__name__, param_dico)
+
+    elif isinstance(model, dict):
+        res = model.__class__()  # to keep the same format (dict, OrderedDict)
+        for k, v in model.items():
+            res[k] = param_from_sklearn_model(v, simplify_default=simplify_default)
+
+        return res
+
+    elif isinstance(model, list):
+        return [param_from_sklearn_model(v, simplify_default=simplify_default) for v in model]
+
+    elif isinstance(model, tuple):
+        return tuple([param_from_sklearn_model(v, simplify_default=simplify_default) for v in model])
+
+    elif isinstance(model, np.number):
+        if model.dtype.kind == "i":
+            return int(model)
+
+        elif model.dtype.kind == "f":
+            return float(model)
+
+        else:
+            return model
+
+    elif isinstance(model, np.bool_):
+        return bool(model)
+
+    elif isinstance(model, np.str_):
+        return str(model)
+
+    else:
+        return model
\ No newline at end of file
diff --git a/aikit/transformers/base.py b/aikit/transformers/base.py
index fa758f4..37ad3b3 100644
--- a/aikit/transformers/base.py
+++ b/aikit/transformers/base.py
@@ -1241,18 +1241,6 @@ def target_inverse_transform(self, my):
         return np.sign(my) * (np.exp(np.log1p(self.ll * np.abs(my)) / self.ll) - 1)
 
 
-# def column_iterate(X, type_of_data = None):
-#    if type_of_data is None:
-#        type_of_data = get_type(X)
-#
-#    if type_of_data in (DataTypes.DataFrame,DataTypes.NumpyArray):
-#        for column in X.columns:
-#            yield column,X[column]
-#
-#    elif type_of_data in (DataTypes.NumpyArray, DataTypes.SparseArray):
-#        for j in range(X.shape[1]):
-#            yield j,X[:,j]
-
 
 # In[]
 def _gen_column_iterator(X, type_of_data=None):
     """ generic column interator, helper to iterator if the column of a data object """
diff --git a/tests/test_model_definition.py b/tests/test_model_definition.py
index ce3b6a6..3a2f5c9 100644
--- a/tests/test_model_definition.py
+++ b/tests/test_model_definition.py
@@ -6,12 +6,13 @@
 """
 
 import copy
-
+import json
 
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
 
-from aikit.model_definition import sklearn_model_from_param
+from aikit.model_definition import sklearn_model_from_param, param_from_sklearn_model, filtered_get_params
 
 from aikit.transformers.base import BoxCoxTargetTransformer, KMeansTransformer, TruncatedSVDWrapper, NumImputer
 from aikit.transformers.categories import NumericalEncoder
@@ -27,15 +28,18 @@ def test_random_forest(self):
         ### Random Forest ###
         #####################
 
-        param1 = ("RandomForestClassifier", {"n_estimators": 100, "criterion": "entropy"})
-        param1_c = copy.deepcopy(param1)
+        param = ("RandomForestClassifier", {"n_estimators": 100, "criterion": "entropy"})
+        param_c = copy.deepcopy(param)
 
-        model1 = sklearn_model_from_param(param1)
+        model = sklearn_model_from_param(param)
 
-        assert isinstance(model1, RandomForestClassifier)
-        assert model1.n_estimators == 100
+        assert isinstance(model, RandomForestClassifier)
+        assert model.n_estimators == 100
 
-        assert param1 == param1_c  # verif that param was not modified inside function
+        assert param == param_c  # verify that param was not modified inside the function
+
+        param_reverse = param_from_sklearn_model(model)
+        assert param_reverse == param
 
     def test_logistic_regression(self):
         ###########################
@@ -43,41 +47,49 @@ def test_logistic_regression(self):
         ### Logistic Regression ###
         ###########################
         from sklearn.linear_model import LogisticRegression
 
-        param2 = ("LogisticRegression", {"C": 10})
-        param2_c = copy.deepcopy(param2)
+        param = ("LogisticRegression", {"C": 10})
+        param_c = copy.deepcopy(param)
+
+        model = sklearn_model_from_param(param)
 
-        model2 = sklearn_model_from_param(param2)
+        assert isinstance(model, LogisticRegression)
+        assert model.C == 10
 
-        assert isinstance(model2, LogisticRegression)
-        assert model2.C == 10
+        assert param == param_c  # verify that param was not modified inside the function
 
-        assert param2 == param2_c  # verif that param was not modified inside function
+        param_reverse = param_from_sklearn_model(model)
+        assert param_reverse == param
 
     def test_graph_pipeline(self):
         #####################
         ### GraphPipeline ###
         #####################
 
-        param3 = (
+        param = (
             "GraphPipeline",
             {
                 "models": {
-                    "svd": ("TruncatedSVDWrapper", {"n_components": 2}),
+                    "svd": ("TruncatedSVDWrapper", {"n_components": 3}),
                     "logit": ("LogisticRegression", {"C": 10}),
                 },
                 "edges": [("svd", "logit")],
             },
         )
-        param3_c = copy.deepcopy(param3)
+        param_c = copy.deepcopy(param)
 
-        model3 = sklearn_model_from_param(param3)
+        model = sklearn_model_from_param(param)
 
-        assert isinstance(model3, GraphPipeline)
-        assert isinstance(model3.models["logit"], LogisticRegression)
-        assert isinstance(model3.models["svd"], TruncatedSVDWrapper)
+        assert isinstance(model, GraphPipeline)
+        assert isinstance(model.models["logit"], LogisticRegression)
+        assert isinstance(model.models["svd"], TruncatedSVDWrapper)
+        assert model.models["svd"].n_components == 3
+
+        assert param == param_c
+
+        param_reverse = param_from_sklearn_model(model)
+        assert param_reverse == param
 
-        assert param3 == param3_c
 
     def test_graph_pipeline_list(self):
         #####################
@@ -86,59 +98,64 @@ def test_graph_pipeline_list(self):
 
         # Test when inputs are list and not tuples
 
-        param4 = [
+        param = (
             "GraphPipeline",
             {
                 "edges": [["encoder", "imputer", "rf"], ["vect", "svd", "rf"]],
                 "models": {
-                    "encoder": [
+                    "encoder": (
                         "NumericalEncoder",
                         {
                             "columns_to_use": ["^BLOCK_", "^NUMBERTOKEN_", "^DATETOKEN_", "^CURRENCYTOKEN_"],
                             "regex_match": True,
                         },
-                    ],
-                    "imputer": ["NumImputer", {}],
-                    "rf": ["RandomForestClassifier", {"n_estimators": 500}],
-                    "svd": ["TruncatedSVDWrapper", {"n_components": 200}],
-                    "vect": [
+                    ),
+                    "imputer": ("NumImputer", {}),
+                    "rf": ("RandomForestClassifier", {"n_estimators": 500}),
+                    "svd": ("TruncatedSVDWrapper", {"n_components": 200}),
+                    "vect": (
                         "CountVectorizerWrapper",
                         {
                             "analyzer": "char",
                             "columns_to_use": ["STRINGLEFTOF", "STRINGABOVEOF"],
                             "ngram_range": [1, 4],
                         },
-                    ],
+                    ),
                 },
             },
-        ]
-
-        param4_c = copy.deepcopy(param4)
-
-        model4 = sklearn_model_from_param(param4)
+        )
 
-        assert isinstance(model4, GraphPipeline)
-        assert isinstance(model4.models["encoder"], NumericalEncoder)
-        assert isinstance(model4.models["imputer"], NumImputer)
-        assert isinstance(model4.models["vect"], CountVectorizerWrapper)
-        assert isinstance(model4.models["svd"], TruncatedSVDWrapper)
-        assert isinstance(model4.models["rf"], RandomForestClassifier)
+        param_c = copy.deepcopy(param)
 
-        assert param4 == param4_c
+        model = sklearn_model_from_param(param)
 
+        assert isinstance(model, GraphPipeline)
+        assert isinstance(model.models["encoder"], NumericalEncoder)
+        assert isinstance(model.models["imputer"], NumImputer)
+        assert isinstance(model.models["vect"], CountVectorizerWrapper)
+        assert isinstance(model.models["svd"], TruncatedSVDWrapper)
+        assert isinstance(model.models["rf"], RandomForestClassifier)
+
+        assert param == param_c
+
+        param_reverse = param_from_sklearn_model(model)
+        assert param_reverse == param
+
     def test_boxcox_target_transformer(self):
 
         ## syntax 1 ##
-        params = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}))
+        param = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}))
 
-        params_c = copy.deepcopy(params)
+        param_c = copy.deepcopy(param)
 
-        model = sklearn_model_from_param(params_c)
+        model = sklearn_model_from_param(param_c)
 
         assert isinstance(model, BoxCoxTargetTransformer)
         assert isinstance(model.model, RandomForestClassifier)
-        assert params == params_c
-
+        assert param == param_c
+        param_reverse = param_from_sklearn_model(model)  # rmk : differs from param because the RandomForestClassifier isn't explicitly passed as a named argument
+        assert param_reverse[0] == param[0]
+
         ## syntax 2 ##
         params = ("BoxCoxTargetTransformer", ("RandomForestClassifier", {}), {"ll": 10})
@@ -149,6 +166,9 @@ def test_boxcox_target_transformer(self):
         assert isinstance(model.model, RandomForestClassifier)
         assert model.ll == 10
         assert params == params_c
+        param_reverse = param_from_sklearn_model(model)  # rmk : differs from params because the RandomForestClassifier isn't explicitly passed as a named argument
+
+        assert param_reverse[0] == params[0]
 
         ## syntax 3 ##
         params = ("BoxCoxTargetTransformer", {"model": ("RandomForestClassifier", {}), "ll": 10})
@@ -161,7 +181,9 @@ def test_boxcox_target_transformer(self):
         assert isinstance(model.model, RandomForestClassifier)
         assert model.ll == 10
         assert params == params_c
-
+        param_reverse = param_from_sklearn_model(model)
+        assert param_reverse == params
+
 
     def boxcox_and_graphpipeline(self):
 
         params = (
@@ -214,6 +236,10 @@ def boxcox_and_graphpipeline(self):
         )
 
         assert params == params_c
+
+        param_reverse = param_from_sklearn_model(model)  # rmk : differs from params because the RandomForestClassifier isn't explicitly passed as a named argument
+
+        assert param_reverse[0] == params[0]
 
     def test_stacking_classifier(self):
 
@@ -236,3 +262,135 @@ def test_stacking_classifier(self):
         assert isinstance(model.models[1], ExtraTreesClassifier)
         assert isinstance(model.blender, LogisticRegression)
         assert model.cv == 5
+        param_reverse = param_from_sklearn_model(model)
+
+        assert param_reverse == params
+
+
+def test_filtered_get_params():
+    forest = RandomForestClassifier(n_estimators=250)
+    assert RandomForestClassifier().get_params()["n_estimators"] != 250
+    assert filtered_get_params(forest) == {"n_estimators": 250}
+
+    forest = RandomForestClassifier(n_estimators=250, max_depth=None)
+    assert filtered_get_params(forest) == {"n_estimators": 250}
+
+    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=0)
+    fparams = filtered_get_params(model)
+
+    assert "ll" not in fparams
+    assert "model" in fparams
+
+    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1)
+    assert BoxCoxTargetTransformer(RandomForestClassifier()).get_params()["ll"] != 1
+    fparams = filtered_get_params(model)
+
+    assert "ll" in fparams
+    assert fparams["ll"] == 1
+    assert "model" in fparams
+
+
+def test_param_from_sklearn_model():
+    # simple RandomForest
+    model = RandomForestClassifier(n_estimators=250)
+    assert RandomForestClassifier().get_params()["n_estimators"] != 250
+    assert param_from_sklearn_model(model, simplify_default=True) == ('RandomForestClassifier', {'n_estimators': 250})
+    param = param_from_sklearn_model(model, simplify_default=False)
+    assert isinstance(param, tuple)
+    assert len(param) == 2
+    assert param[0] == "RandomForestClassifier"
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+
+    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier
+    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=0)
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('BoxCoxTargetTransformer',
+                     {'model': ('RandomForestClassifier', {'n_estimators': 250})})
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
+    # Composition model : BoxCoxTargetTransformer of RandomForestClassifier, ll != default
+    model = BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1)
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('BoxCoxTargetTransformer',
+                     {'ll': 1, 'model': ('RandomForestClassifier', {'n_estimators': 250})})
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+
+    # Pipeline
+    model = Pipeline([("enc", NumericalEncoder()), ("forest", RandomForestClassifier(n_estimators=250))])
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('Pipeline',
+                     {'steps': [('enc', ('NumericalEncoder', {})),
+                                ('forest', ('RandomForestClassifier', {'n_estimators': 250}))]})
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
+    # GraphPipeline
+    model = GraphPipeline(models={"enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250)},
+                          edges=[("enc", "forest")]
+                          )
+
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('GraphPipeline',
+                     {'models': {'enc': ('NumericalEncoder', {}),
+                                 'forest': ('RandomForestClassifier', {'n_estimators': 250})},
+                      'edges': [('enc', 'forest')]
+                      })
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+
+    # GraphPipeline with verbose = True
+    model = GraphPipeline(models={"enc": NumericalEncoder(), "forest": RandomForestClassifier(n_estimators=250)},
+                          edges=[("enc", "forest")],
+                          verbose=True
+                          )
+
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('GraphPipeline',
+                     {'models': {'enc': ('NumericalEncoder', {}),
+                                 'forest': ('RandomForestClassifier', {'n_estimators': 250})},
+                      'edges': [('enc', 'forest')],
+                      'verbose': True
+                      })
+
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
+    model2 = sklearn_model_from_param(param_from_sklearn_model(model))
+    assert model2.verbose is True
+    assert isinstance(model2, model.__class__)
+
+    # GraphPipeline + composition
+    model = GraphPipeline(models={"enc": NumericalEncoder(),
+                                  "forest": BoxCoxTargetTransformer(RandomForestClassifier(n_estimators=250), ll=1)},
+                          edges=[("enc", "forest")]
+                          )
+
+    param = param_from_sklearn_model(model, simplify_default=True)
+    assert param == ('GraphPipeline',
+                     {'edges': [('enc', 'forest')],
+                      'models': {'enc': ('NumericalEncoder', {}),
+                                 'forest': ('BoxCoxTargetTransformer',
+                                            {'ll': 1, 'model': ('RandomForestClassifier', {'n_estimators': 250})})}})
+
+    assert isinstance(sklearn_model_from_param(param_from_sklearn_model(model)), model.__class__)
+    s = json.dumps(param)  # check that it can be json serialized
+    assert isinstance(s, str)
+
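A minimal round-trip sketch of the new serialization API introduced above, mirroring the tests (it assumes the estimators used are known to DICO_NAME_KLASS so they can be re-instantiated):

    import json

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline

    from aikit.model_definition import param_from_sklearn_model, sklearn_model_from_param
    from aikit.transformers.categories import NumericalEncoder

    # build a model, serialize it to a json-friendly structure, then rebuild it
    model = Pipeline([("enc", NumericalEncoder()), ("forest", RandomForestClassifier(n_estimators=250))])
    param = param_from_sklearn_model(model)        # ('Pipeline', {'steps': [...]}) with default parameters omitted
    serialized = json.dumps(param)                 # plain string, easy to store alongside experiment results
    model_again = sklearn_model_from_param(param)  # rebuild an unfitted estimator with the same parameters
    assert isinstance(model_again, Pipeline)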