Skip to content

Commit

Permalink
Fix guess_type_of_variable failure with boolean type (#17)
Browse files Browse the repository at this point in the history
* initialize seed

* make 'guess_type_of_variable' work with boolean Series

* more tests on RandomModelGenerator

 * specific hyper
 * filter models
  • Loading branch information
LionelMassoulard authored and gfournier committed Sep 23, 2019
1 parent fd4b2fb commit f49c4f3
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 4 deletions.
1 change: 1 addition & 0 deletions aikit/ml_machine/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
self.done_queue_sleeping_time = done_queue_sleeping_time

self.seed = seed
self.random_state = seed

@property
def random_state(self):
Expand Down
9 changes: 8 additions & 1 deletion aikit/tools/db_informations.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,19 @@ def guess_type_of_variable(s):
elif "object" in st:
nb_u = s.nunique() # number of different values
nb = len(s) # number of items
avg_l = s.str.len().mean()

if hasattr(s,"str"): #For boolean
avg_l = s.str.len().mean()
else:
avg_l = 0

if avg_l >= 50 or nb_u >= 0.5 * nb:
return TypeOfVariables.TEXT

return TypeOfVariables.CAT

elif "bool" in st:
return TypeOfVariables.CAT

else:
raise NotImplementedError("I don't know that type of Series : %s, please check" % st)
Expand Down
21 changes: 19 additions & 2 deletions tests/ml_machine/test_ml_machine.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,16 @@ def _all_same(all_gen):

return True


def test_RandomModelGenerator_random():
@pytest.mark.parametrize("specific_hyper, only_random_forest",[(True,True),(True,False),(False,True),(False,False)])
def test_RandomModelGenerator_random(specific_hyper, only_random_forest):

dfX, y, auto_ml_config = get_automl_config()

if specific_hyper:
auto_ml_config.specific_hyper = {('Model', 'RandomForestClassifier') : {"n_estimators":[10,20]}}

if only_random_forest:
auto_ml_config.filter_models(Model='RandomForestClassifier')

random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)

Expand Down Expand Up @@ -331,6 +337,17 @@ def test_RandomModelGenerator_random():

model = sklearn_model_from_param(result["json_code"])
assert hasattr(model, "fit")

rf_key = ('Model', ('Model', 'RandomForestClassifier'))
if only_random_forest:
assert rf_key in all_models_params

if specific_hyper:
if rf_key in all_models_params:
assert all_models_params[rf_key]["n_estimators"] in (10,20)

if not only_random_forest:
assert any([ rf_key not in m[1] for m in all_gen]) # Check that RandomForest wasn't drawn every time

### re-draw them thing with other seed ###
random_model_generator = RandomModelGenerator(auto_ml_config=auto_ml_config, random_state=123)
Expand Down
11 changes: 10 additions & 1 deletion tests/tools/test_db_informations.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
import numpy as np

from aikit.tools.db_informations import has_missing_values
from aikit.tools.db_informations import has_missing_values, guess_type_of_variable, TypeOfVariables


def test_has_missing_values():
Expand All @@ -27,3 +27,12 @@ def test_has_missing_values():

def verif_all():
    """Run the tests of this module directly, without a test runner."""
    test_has_missing_values()
    # Keep the manual entry point in sync with the tests defined in this file.
    test_guess_type_of_variable_boolean()


def test_guess_type_of_variable_boolean():
    """Boolean data must be guessed as categorical, with or without a None entry."""
    pattern_with_none = [True, False, True, None]
    pattern_without_none = [True, False, True]

    for pattern in (pattern_with_none, pattern_without_none):
        serie = pd.Series(pattern * 10)
        assert guess_type_of_variable(serie) == TypeOfVariables.CAT

20 changes: 20 additions & 0 deletions tests/transformers/test_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,26 @@ def test_NumericalEncoder_num_fit_parameters():
assert len(encoder.model.variable_modality_mapping['cat_col_1']) == 4
assert len(encoder.model.variable_modality_mapping['cat_col_2']) == 4
assert len(encoder.model.variable_modality_mapping['cat_col_3']) == 4

assert res["cat_col_1"].nunique() == 4
assert res["cat_col_2"].nunique() == 4
assert res["cat_col_3"].nunique() == 4


def test_NumericalEncoder_with_boolean():
    """A boolean column is encoded into int32 'c__True' / 'c__False' indicator columns."""
    df_bool = pd.DataFrame({"c": [True, False] * 200})

    encoded = NumericalEncoder().fit_transform(df_bool)

    # Both modalities must get their own indicator column.
    for column in ("c__True", "c__False"):
        assert column in encoded.columns

    # Each indicator is 1 exactly where the source column has that value.
    source = df_bool["c"]
    assert ((encoded["c__True"] == 1) == source).all()
    assert ((encoded["c__False"] == 1) == ~source).all()

    for column in ("c__True", "c__False"):
        assert encoded[column].dtype == np.int32


@pytest.mark.xfail()
def test_bug_CategoryEncoder():
Expand Down

0 comments on commit f49c4f3

Please sign in to comment.