From c93e3011f343ec4181bc1585e701b28ca5e4c27c Mon Sep 17 00:00:00 2001 From: Lionel Massoulard Date: Sat, 14 Sep 2019 18:47:19 +0200 Subject: [PATCH 1/3] initialize seed --- aikit/ml_machine/jobs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aikit/ml_machine/jobs.py b/aikit/ml_machine/jobs.py index 2e0ea0e..2559e07 100644 --- a/aikit/ml_machine/jobs.py +++ b/aikit/ml_machine/jobs.py @@ -63,6 +63,7 @@ def __init__( self.done_queue_sleeping_time = done_queue_sleeping_time self.seed = seed + self.random_state = seed @property def random_state(self): From c2a63c443c117247429115acb8ef65fa0a865b18 Mon Sep 17 00:00:00 2001 From: Lionel Massoulard Date: Sat, 14 Sep 2019 18:48:03 +0200 Subject: [PATCH 2/3] make 'guess_type_of_variable' works with boolean serie --- aikit/tools/db_informations.py | 9 ++++++++- tests/tools/test_db_informations.py | 11 ++++++++++- tests/transformers/test_categories.py | 20 ++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/aikit/tools/db_informations.py b/aikit/tools/db_informations.py index 314c18b..86d01d7 100644 --- a/aikit/tools/db_informations.py +++ b/aikit/tools/db_informations.py @@ -83,12 +83,19 @@ def guess_type_of_variable(s): elif "object" in st: nb_u = s.nunique() # number of different values nb = len(s) # number of items - avg_l = s.str.len().mean() + + if hasattr(s,"str"): #For boolean + avg_l = s.str.len().mean() + else: + avg_l = 0 if avg_l >= 50 or nb_u >= 0.5 * nb: return TypeOfVariables.TEXT return TypeOfVariables.CAT + + elif "bool" in st: + return TypeOfVariables.CAT else: raise NotImplementedError("I don't know that type of Series : %s, please check" % st) diff --git a/tests/tools/test_db_informations.py b/tests/tools/test_db_informations.py index 3d0493e..a01f439 100644 --- a/tests/tools/test_db_informations.py +++ b/tests/tools/test_db_informations.py @@ -8,7 +8,7 @@ import pandas as pd import numpy as np -from aikit.tools.db_informations import has_missing_values +from aikit.tools.db_informations import has_missing_values, guess_type_of_variable, TypeOfVariables def test_has_missing_values(): @@ -27,3 +27,12 @@ def test_has_missing_values(): def verif_all(): test_has_missing_values() + + +def test_guess_type_of_variable_boolean(): + s = pd.Series([True,False,True,None]*10) + assert guess_type_of_variable(s) == TypeOfVariables.CAT + + s = pd.Series([True,False,True]*10) + assert guess_type_of_variable(s) == TypeOfVariables.CAT + diff --git a/tests/transformers/test_categories.py b/tests/transformers/test_categories.py index 31d9ba7..578465b 100644 --- a/tests/transformers/test_categories.py +++ b/tests/transformers/test_categories.py @@ -243,6 +243,26 @@ def test_NumericalEncoder_num_fit_parameters(): assert len(encoder.model.variable_modality_mapping['cat_col_1']) == 4 assert len(encoder.model.variable_modality_mapping['cat_col_2']) == 4 assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 4 + + assert res["cat_col_1"].nunique() == 4 + assert res["cat_col_2"].nunique() == 4 + assert res["cat_col_3"].nunique() == 4 + + +def test_NumericalEncoder_with_boolean(): + dfX = pd.DataFrame({"c":[True,False]*200}) + + enc = NumericalEncoder() + + dfX_encoded = enc.fit_transform(dfX) + + assert "c__True" in dfX_encoded.columns + assert "c__False" in dfX_encoded.columns + assert ((dfX_encoded["c__True"] == 1) == (dfX["c"])).all() + assert ((dfX_encoded["c__False"] == 1) == (~dfX["c"])).all() + assert dfX_encoded["c__True"].dtype == np.int32 + assert dfX_encoded["c__False"].dtype == np.int32 + @pytest.mark.xfail() def test_bug_CategoryEncoder(): From 21bee76e727647e789caf645e28d6239b355146f Mon Sep 17 00:00:00 2001 From: Lionel Massoulard Date: Sat, 14 Sep 2019 18:49:12 +0200 Subject: [PATCH 3/3] more tests on RandomModelGenerator * specific hyper * filter models --- tests/ml_machine/test_ml_machine.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/ml_machine/test_ml_machine.py b/tests/ml_machine/test_ml_machine.py index 1536850..e1ff1c4 100644 --- a/tests/ml_machine/test_ml_machine.py +++ b/tests/ml_machine/test_ml_machine.py @@ -296,10 +296,16 @@ def _all_same(all_gen): return True - -def test_RandomModelGenerator_random(): +@pytest.mark.parametrize("specific_hyper, only_random_forest",[(True,True),(True,False),(False,True),(False,False)]) +def test_RandomModelGenerator_random(specific_hyper, only_random_forest): dfX, y, auto_ml_config = get_automl_config() + + if specific_hyper: + auto_ml_config.specific_hyper = {('Model', 'RandomForestClassifier') : {"n_estimators":[10,20]}} + + if only_random_forest: + auto_ml_config.filter_models(Model='RandomForestClassifier') random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123) @@ -331,6 +337,17 @@ def test_RandomModelGenerator_random(): model = sklearn_model_from_param(result["json_code"]) assert hasattr(model, "fit") + + rf_key = ('Model', ('Model', 'RandomForestClassifier')) + if only_random_forest: + assert rf_key in all_models_params + + if specific_hyper: + if rf_key in all_models_params: + assert all_models_params[rf_key]["n_estimators"] in (10,20) + + if not only_random_forest: + assert any([ rf_key not in m[1] for m in all_gen]) # Check that RandomForest wasn't drawn every time ### re-draw them thing with other seed ### random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)