Fix guess_type_of_variable failure with boolean type (#17)

* initialize seed * make 'guess_type_of_variable' work with boolean series * more tests on RandomModelGenerator * specific hyper * filter models
societe-generale · Sep 23, 2019 · f49c4f3 · f49c4f3
1 parent fd4b2fb
commit f49c4f3
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 4 deletions.
diff --git a/aikit/ml_machine/jobs.py b/aikit/ml_machine/jobs.py
@@ -63,6 +63,7 @@ def __init__(
         self.done_queue_sleeping_time = done_queue_sleeping_time
 
         self.seed = seed
+        self.random_state = seed
 
     @property
     def random_state(self):

diff --git a/aikit/tools/db_informations.py b/aikit/tools/db_informations.py
@@ -83,12 +83,19 @@ def guess_type_of_variable(s):
     elif "object" in st:
         nb_u = s.nunique()  # number of different values
         nb = len(s)  # number of items
-        avg_l = s.str.len().mean()
+
+        if hasattr(s,"str"): #For boolean 
+            avg_l = s.str.len().mean()
+        else:
+            avg_l = 0
 
         if avg_l >= 50 or nb_u >= 0.5 * nb:
             return TypeOfVariables.TEXT
 
         return TypeOfVariables.CAT
+
+    elif "bool" in st:
+        return TypeOfVariables.CAT
 
     else:
         raise NotImplementedError("I don't know that type of Series : %s, please check" % st)

diff --git a/tests/ml_machine/test_ml_machine.py b/tests/ml_machine/test_ml_machine.py
@@ -296,10 +296,16 @@ def _all_same(all_gen):
 
     return True
 
-
-def test_RandomModelGenerator_random():
+@pytest.mark.parametrize("specific_hyper, only_random_forest",[(True,True),(True,False),(False,True),(False,False)])
+def test_RandomModelGenerator_random(specific_hyper, only_random_forest):
 
     dfX, y, auto_ml_config = get_automl_config()
+
+    if specific_hyper:
+        auto_ml_config.specific_hyper = {('Model', 'RandomForestClassifier') : {"n_estimators":[10,20]}}
+
+    if only_random_forest:
+        auto_ml_config.filter_models(Model='RandomForestClassifier')
 
     random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
 
@@ -331,6 +337,17 @@ def test_RandomModelGenerator_random():
 
         model = sklearn_model_from_param(result["json_code"])
         assert hasattr(model, "fit")
+
+        rf_key = ('Model', ('Model', 'RandomForestClassifier'))
+        if only_random_forest:
+            assert rf_key in all_models_params
+
+        if specific_hyper:
+            if rf_key in all_models_params:
+                assert all_models_params[rf_key]["n_estimators"] in (10,20)
+
+    if not only_random_forest:
+        assert any([ rf_key not in m[1] for m in all_gen]) # Check that RandomForest wasn't drawn every time
 
     ### re-draw them thing with other seed ###
     random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

diff --git a/tests/tools/test_db_informations.py b/tests/tools/test_db_informations.py
@@ -8,7 +8,7 @@
 import pandas as pd
 import numpy as np
 
-from aikit.tools.db_informations import has_missing_values
+from aikit.tools.db_informations import has_missing_values, guess_type_of_variable, TypeOfVariables
 
 
 def test_has_missing_values():
@@ -27,3 +27,12 @@ def test_has_missing_values():
 
 def verif_all():
     test_has_missing_values()
+
+
+def test_guess_type_of_variable_boolean():
+    s = pd.Series([True,False,True,None]*10)  # booleans mixed with None
+    assert guess_type_of_variable(s) == TypeOfVariables.CAT  # expected to be classified as categorical
+
+    s = pd.Series([True,False,True]*10)  # booleans only, no None
+    assert guess_type_of_variable(s) == TypeOfVariables.CAT  # expected to be classified as categorical
+
diff --git a/tests/transformers/test_categories.py b/tests/transformers/test_categories.py
@@ -243,6 +243,26 @@ def test_NumericalEncoder_num_fit_parameters():
     assert len(encoder.model.variable_modality_mapping['cat_col_1']) == 4
     assert len(encoder.model.variable_modality_mapping['cat_col_2']) == 4
     assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 4
+
+    assert res["cat_col_1"].nunique() == 4
+    assert res["cat_col_2"].nunique() == 4
+    assert res["cat_col_3"].nunique() == 4
+
+
+def test_NumericalEncoder_with_boolean():
+    dfX = pd.DataFrame({"c":[True,False]*200})  # single boolean column "c": 400 rows alternating True/False
+
+    enc = NumericalEncoder()
+
+    dfX_encoded = enc.fit_transform(dfX)
+
+    assert "c__True" in dfX_encoded.columns  # an output column is expected for each of the two boolean values
+    assert "c__False" in dfX_encoded.columns
+    assert ((dfX_encoded["c__True"] == 1) == (dfX["c"])).all()  # "c__True" equals 1 exactly on the rows where c is True
+    assert ((dfX_encoded["c__False"] == 1) == (~dfX["c"])).all()  # "c__False" equals 1 exactly on the rows where c is False
+    assert dfX_encoded["c__True"].dtype == np.int32  # both encoded columns must have dtype int32
+    assert dfX_encoded["c__False"].dtype == np.int32
+
 
 @pytest.mark.xfail()
 def test_bug_CategoryEncoder():